def main():

    np.set_printoptions(formatter={'float_kind': lambda x: "%.2f" % x})

    # Define environment
    env = envstandalone.BlockArrange()

    # Dictionary-based value function
    q_func_tabular = {}

    # Encode each boolean row of vectorKey as a single integer key.
    # The row length must be at most 64 bits, or distinct keys will collide.
    def getTabularKeys(vectorKey):
        obsBits = np.packbits(vectorKey, axis=1)
        obsKeys = 0
        for i in range(np.shape(obsBits)[1]):
            # IMPORTANT: the number of bits in the type cast below (UINT64) must be at least as big
            # as the bits required to encode obsBits. If it is too small, we get hash collisions...
            obsKeys = obsKeys + (256**i) * np.uint64(obsBits[:, i])
        return obsKeys
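
    # Illustrative example (not executed): a row of 9 booleans such as
    # [1,0,0,0,0,0,0,0,1] packs MSB-first into the bytes [128, 128], giving
    # the key 128 + 256*128 = 32896. Up to 8 packed bytes (64 boolean
    # columns) fit in a uint64 key without collisions.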

    def getTabular(vectorKey):
        keys = getTabularKeys(vectorKey)
        # Unvisited keys get an optimistic initial value of 10 to encourage
        # trying untested actions.
        return np.array([
            q_func_tabular[x] if x in q_func_tabular else 10 *
            np.ones(num_states) for x in keys
        ])

    def trainTabular(vectorKey, qCurrTargets, weights):
        keys = getTabularKeys(vectorKey)
        alpha = 0.2  # tabular learning rate
        for i in range(len(keys)):
            if keys[i] in q_func_tabular:
                # weighted TD update: Q <- Q + alpha * w * (target - Q)
                q_func_tabular[keys[i]] = q_func_tabular[keys[i]] + \
                    alpha * weights[i, :] * (qCurrTargets[i] - q_func_tabular[keys[i]])
            else:
                q_func_tabular[keys[i]] = qCurrTargets[i]
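
    # Example (illustrative): with alpha = 0.2, Q = 10.0, target = 0.0 and
    # weight 1.0, a single update moves Q to 10.0 + 0.2*(0.0 - 10.0) = 8.0.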

    # Standard DQN parameters
    max_timesteps = 10000
    learning_starts = 1000
    buffer_size = 10000
    exploration_fraction = 0.2
    exploration_final_eps = 0.02
    print_freq = 1
    gamma = .9
    target_network_update_freq = 1
    batch_size = 32
    train_freq = 1
    num_cpu = 16
    lr = 0.0003
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
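    # With these settings, epsilon decays linearly from 1.0 at t = 0 to 0.02
    # at t = 0.2 * 10000 = 2000, then stays at 0.02 for the rest of training.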

    prioritized_replay = True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1
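
    # Prioritized replay (Schaul et al.) samples transition i with probability
    # proportional to priority_i^alpha and corrects the resulting bias with
    # importance weights w_i = (N * P(i))^(-beta); beta anneals from 0.4 to
    # 1.0 over training.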

    # Deictic state/action parameters
    deicticShape = (3, 3, 2)  # IMPORTANT: first two elements must be odd
    deicticActionShape = (3, 3, 4)
    num_cascade = 5
    num_states = env.num_blocks + 1  # one more state than blocks to account for not holding anything
    num_patches = env.maxSide**2
    num_actions = 2 * num_patches
    num_actions_discrete = 2
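    # Action encoding: indices [0, num_patches) are picks at the
    # corresponding patch; indices [num_patches, 2*num_patches) are places
    # at patch (action - num_patches).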
    #    valueFunctionType = "TABULAR"
    valueFunctionType = "DQN"

    # ******* Build tensorflow functions ********

    q_func = models.cnn_to_mlp(
        convs=[(32, 3, 1)],
        hiddens=[32],
        dueling=True)

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(deicticActionShape, name=name)

    def make_target_ph(name):
        return U.BatchInput([num_states], name=name)

    def make_weight_ph(name):
        return U.BatchInput([num_states], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph, deicticShape=deicticShape)

    getq = build_getq(make_actionDeic_ph=make_actionDeic_ph,
                      q_func=q_func,
                      num_states=num_states,
                      num_cascade=num_cascade,
                      scope="deepq",
                      qscope="q_func")

    targetTrain = build_targetTrain(
        make_actionDeic_ph=make_actionDeic_ph,
        make_target_ph=make_target_ph,
        make_weight_ph=make_weight_ph,
        q_func=q_func,
        num_states=num_states,
        num_cascade=num_cascade,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        scope="deepq",
        qscope="q_func",
        grad_norm_clipping=1.)

    # Start tensorflow session
    sess = U.make_session(num_cpu)
    sess.__enter__()

    episode_rewards = [0.0]
    timerStart = time.time()
    U.initialize()
    obs = env.reset()
    for t in range(max_timesteps):

        # Get state: what the gripper is holding, in range(num_states)
        stateDeictic = obs[1]

        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        actionsPickDescriptors = np.concatenate(
            [moveDescriptors,
             np.zeros(np.shape(moveDescriptors))], axis=3)
        actionsPlaceDescriptors = np.concatenate(
            [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,
                                  actionsPlaceDescriptors]
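        # Shapes (illustrative): moveDescriptors is [num_patches, 3, 3, 2];
        # zero-padding the unused channel pair gives [num_patches, 3, 3, 4]
        # pick and place descriptors, and np.r_ stacks them into
        # [2*num_patches, 3, 3, 4], matching the action encoding above.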

        if valueFunctionType == "TABULAR":
            actionDescriptorsFlat = np.reshape(actionDescriptors, [
                -1, deicticActionShape[0] * deicticActionShape[1] *
                deicticActionShape[2]
            ]) == 1
            qCurr = getTabular(actionDescriptorsFlat)
        else:
            qCurr = getq(actionDescriptors)

        # select action
        qCurrNoise = qCurr + np.random.random(np.shape(
            qCurr)) * 0.01  # add small amount of noise to break ties randomly
        action = np.argmax(qCurrNoise[:, stateDeictic])
        if np.random.rand() < exploration.value(t):
            action = np.random.randint(num_actions)

        # take action
        new_obs, rew, done, _ = env.step(action)
        replay_buffer.add(stateDeictic, actionDescriptors[action, :], rew,
                          new_obs, float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta = beta_schedule.value(t)
                states_t, actions, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(
                    batch_size, beta)
            else:
                states_t, actions, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext1 = getMoveActionDescriptors(images_tp1)
            actionsPickDescriptorsNext1 = np.concatenate(
                [moveDescriptorsNext1,
                 np.zeros(np.shape(moveDescriptorsNext1))], axis=3)
            actionsPlaceDescriptorsNext1 = np.concatenate(
                [np.zeros(np.shape(moveDescriptorsNext1)),
                 moveDescriptorsNext1], axis=3)
            # Stack pick/place per patch on axis=1 (not axis=0) so that the
            # flattened row order is (batch*patch, action), matching the
            # [batch_size, num_patches, num_actions_discrete, num_states]
            # reshape of qNext1 below.
            actionDescriptorsNext1 = np.stack(
                [actionsPickDescriptorsNext1, actionsPlaceDescriptorsNext1],
                axis=1)
            actionDescriptorsNext1 = np.reshape(actionDescriptorsNext1, [
                batch_size * num_patches * num_actions_discrete,
                deicticActionShape[0], deicticActionShape[1],
                deicticActionShape[2]
            ])

            if valueFunctionType == "TABULAR":
                actionDescriptorsNextFlat1 = np.reshape(
                    actionDescriptorsNext1,
                    [batch_size * num_patches * num_actions_discrete, -1]) == 1
                qNextFlat1 = getTabular(actionDescriptorsNextFlat1)
            else:
                qNextFlat1 = getq(actionDescriptorsNext1)

            qNext1 = np.reshape(
                qNextFlat1,
                [batch_size, num_patches, num_actions_discrete, num_states])
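            # e.g. with batch_size = 32 and an 8x8 board (num_patches = 64):
            # qNextFlat1 is [32*64*2, num_states] and qNext1 is
            # [32, 64, 2, num_states]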
            # Bellman backup: max over all patches and both discrete actions
            # for the holding state actually reached; terminal transitions
            # (dones = 1) keep only the immediate reward.
            qNextmax1 = np.max(
                np.max(qNext1[range(batch_size), :, :, states_tp1], 2), 1)
            targets1 = rewards + (1 - dones) * gamma * qNextmax1

            if valueFunctionType == "TABULAR":
                actionsFlat = np.reshape(actions, [batch_size, -1]) == 1
                qCurrTarget1 = getTabular(actionsFlat)
            else:
                qCurrTarget1 = getq(actions)

            td_errors = qCurrTarget1[range(batch_size), states_t] - targets1
            qCurrTarget1[range(batch_size), states_t] = targets1

            if valueFunctionType == "TABULAR":
                trainTabular(actionsFlat, qCurrTarget1,
                             np.transpose(np.tile(
                                 weights, [num_states, 1])))  # (TABULAR)
            else:
                targetTrain(actions, qCurrTarget1,
                            np.transpose(np.tile(weights,
                                                 [num_states, 1])))  # (DQN)
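            # The scalar importance weight of each transition was tiled across
            # all num_states outputs ([batch_size] -> [batch_size, num_states])
            # so only the magnitude of the update is scaled, not its direction.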

            if prioritized_replay:
                # new priority = |TD error| + eps, so no transition's
                # sampling probability collapses to zero
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " +
                  str(int(100 * exploration.value(t))) + ", beta: " +
                  str(beta) + ", time elapsed: " +
                  str(timerFinal - timerStart))
            timerStart = timerFinal

        obs = new_obs

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    actionsPickDescriptors = np.concatenate(
        [moveDescriptors, np.zeros(np.shape(moveDescriptors))], axis=3)
    actionsPlaceDescriptors = np.concatenate(
        [np.zeros(np.shape(moveDescriptors)), moveDescriptors], axis=3)

    print(str(obs[0][:, :, 0]))

    qPick = getq(actionsPickDescriptors)
    #    qPick = getTabular(np.reshape(actionsPickDescriptors,[num_patches,-1])==1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:, 0], [env.maxSide, env.maxSide])))

    qPlace = getq(actionsPlaceDescriptors)
    #    qPlace = getTabular(np.reshape(actionsPlaceDescriptors,[num_patches,-1])==1)
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:, 1], [env.maxSide, env.maxSide])))

    print("Value function for place action in hold-2 state:")
    print(str(np.reshape(qPlace[:, 2], [env.maxSide, env.maxSide])))

# PuckArrange variant: same deictic DQN, but with separate Q-networks for the
# holding / not-holding states, strided action patches, and optional model
# save/restore. Note that this redefines main() above; only one variant is
# active in a given run.
def main(envStride, fileIn, fileOut, inputmaxtimesteps):

    reuseModels = None
    
    np.set_printoptions(formatter={'float_kind':lambda x: "%.2f" % x})

    env = envstandalone.PuckArrange()
    env.stride = envStride  # stride input to this problem
    env.reset()  # need to do the reset here in order to populate parameters
    
    # Standard q-learning parameters
    max_timesteps = inputmaxtimesteps
    exploration_fraction = 0.3
    exploration_final_eps = 0.1
    gamma = .90
    num_cpu = 16

    # Used by buffering and DQN
    learning_starts = 60
    buffer_size = 1000
    batch_size = 10
    target_network_update_freq = 1
    train_freq = 1
    print_freq = 1
    lr = 0.0003

    # first two elements of descriptorShape must be odd
    descriptorShape = (env.blockSize*3, env.blockSize*3, 2)
    descriptorShapeSmall = (20, 20, 2)
    num_states = 2  # either holding or not
    num_patches = len(env.moveCenters)**2
    num_actions = 2*num_patches
    num_actions_discrete = 2
    valueFunctionType = "DQN"
    # Each *unique* action descriptor gets an equal chance of being selected
    # (see RANDOM_UNIQUE below), rather than each patch ("UNIFORM_RANDOM").
    actionSelectionStrategy = "RANDOM_UNIQUE"

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    prioritized_replay = False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    beta = 1

    q_func = models.cnn_to_mlp(
        convs=[(16, 3, 1)],
        hiddens=[32],
        dueling=True
    )
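    # The result is a small dueling network: one 16-filter 3x3 conv (stride 1)
    # followed by a 32-unit hidden layer, with separate value and advantage
    # streams recombined into Q-values.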

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.spaces[0].shape, name=name)

    def make_actionDeic_ph(name):
        return U.BatchInput(descriptorShapeSmall, name=name)

    def make_target_ph(name):
        # one scalar Q target per descriptor (one network per holding state)
        return U.BatchInput([1], name=name)

    def make_weight_ph(name):
        return U.BatchInput([1], name=name)

    getMoveActionDescriptors = build_getMoveActionDescriptors(
        make_obs_ph=make_obs_ph,
        actionShape=descriptorShape,
        actionShapeSmall=descriptorShapeSmall,
        stride=env.stride)

    if valueFunctionType == 'DQN':
        getqNotHolding = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=5,
                scope="deepq",
                qscope="q_func_notholding",
                reuse=reuseModels
                )
        getqHolding = build_getq(
                make_actionDeic_ph=make_actionDeic_ph,
                q_func=q_func,
                num_states=num_states,
                num_cascade=5,
                scope="deepq",
                qscope="q_func_holding",
                reuse=reuseModels
                )
    
        targetTrainNotHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_notholding",
            grad_norm_clipping=1.,
            reuse=reuseModels
        )

        targetTrainHolding = build_targetTrain(
            make_actionDeic_ph=make_actionDeic_ph,
            make_target_ph=make_target_ph,
            make_weight_ph=make_weight_ph,
            q_func=q_func,
            num_states=num_states,
            num_cascade=5,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            scope="deepq", 
            qscope="q_func_holding",
            grad_norm_clipping=1.,
            reuse=reuseModels
        )
        
    sess = U.make_session(num_cpu)
    sess.__enter__()

    obs = env.reset()

    episode_rewards = [0.0]
    td_errors = [0.0]
    timerStart = time.time()
    U.initialize()
    
    # load prior model
    if fileIn != "None":
        saver = tf.train.Saver()
        saver.restore(sess, fileIn)
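        # Restoring reuses the graph built above, so fileIn must point to a
        # checkpoint saved from the same network layout (see saver.save at
        # the end of training).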

    for t in range(max_timesteps):
        
        # Get action set: <num_patches> pick actions followed by <num_patches> place actions
        moveDescriptors = getMoveActionDescriptors([obs[0]])
        moveDescriptors = moveDescriptors*2 - 1  # rescale from {0,1} to {-1,+1}
        actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
        actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)),moveDescriptors],axis=3)
        actionDescriptors = np.r_[actionsPickDescriptors,actionsPlaceDescriptors]
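        # Shapes (illustrative): moveDescriptors is [num_patches, 20, 20];
        # stacking a zero channel (axis=3) gives [num_patches, 20, 20, 2]
        # pick and place descriptors, and np.r_ concatenates them into
        # [2*num_patches, 20, 20, 2].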

        qCurrNotHolding = getqNotHolding(actionDescriptors)
        qCurrHolding = getqHolding(actionDescriptors)
        qCurr = np.concatenate([qCurrNotHolding,qCurrHolding],axis=1)

        # epsilon-greedy action selection
        qCurrNoise = qCurr + np.random.random(np.shape(qCurr))*0.01 # add small amount of noise to break ties randomly
        if actionSelectionStrategy == "UNIFORM_RANDOM":
            # explore uniformly over all patch actions
            action = np.argmax(qCurrNoise[:,obs[1]])
            if np.random.rand() < exploration.value(t):
                action = np.random.randint(num_actions)
        elif actionSelectionStrategy == "RANDOM_UNIQUE":
            # explore uniformly over unique descriptors: choose (greedily or
            # at random) among the unique descriptors, then sample one
            # concrete action that maps to the chosen descriptor
            _, idx, inv = np.unique(actionDescriptors, axis=0, return_index=True, return_inverse=True)
            actionIdx = np.argmax(qCurrNoise[idx, obs[1]])
            if np.random.rand() < exploration.value(t):
                actionIdx = np.random.randint(len(idx))
            actionsSelected = np.nonzero(inv == actionIdx)[0]
            action = actionsSelected[np.random.randint(len(actionsSelected))]
        else:
            raise ValueError("unknown actionSelectionStrategy: " + actionSelectionStrategy)

        # take action
        new_obs, rew, done, _ = env.step(action)
        
        replay_buffer.add(obs[1], actionDescriptors[action,:], rew, np.copy(new_obs), float(done))

        if t > learning_starts and t % train_freq == 0:

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                beta=beta_schedule.value(t)
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones, weights, batch_idxes = replay_buffer.sample(batch_size, beta)
            else:
                states_t, actionPatches, rewards, images_tp1, states_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            moveDescriptorsNext = getMoveActionDescriptors(images_tp1)
            moveDescriptorsNext = moveDescriptorsNext*2 - 1

            actionsPickDescriptorsNext = np.stack([moveDescriptorsNext, np.zeros(np.shape(moveDescriptorsNext))], axis=3)
            actionsPlaceDescriptorsNext = np.stack([np.zeros(np.shape(moveDescriptorsNext)), moveDescriptorsNext], axis=3)
            # stack pick/place per patch on axis=1 (not axis=0!) so the
            # flattened row order is (batch*patch, action), matching the
            # qNext reshape below
            actionDescriptorsNext = np.stack([actionsPickDescriptorsNext, actionsPlaceDescriptorsNext], axis=1)
            actionDescriptorsNext = np.reshape(actionDescriptorsNext, [-1, descriptorShapeSmall[0], descriptorShapeSmall[1], descriptorShapeSmall[2]])

            qNextNotHolding = getqNotHolding(actionDescriptorsNext)
            qNextHolding = getqHolding(actionDescriptorsNext)
            qNextFlat = np.concatenate([qNextNotHolding,qNextHolding],axis=1)

            qNext = np.reshape(qNextFlat,[batch_size,num_patches,num_actions_discrete,num_states])
            qNextmax = np.max(np.max(qNext[range(batch_size),:,:,states_tp1],2),1)
            
            targets = rewards + (1-dones) * gamma * qNextmax
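            # Example (illustrative): with gamma = 0.9, reward 0 and best
            # next-state value 1.0, a non-terminal target is 0.9; a terminal
            # transition (dones = 1) keeps only the immediate reward.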
            
            qCurrTargetNotHolding = getqNotHolding(actionPatches)
            qCurrTargetHolding = getqHolding(actionPatches)
            qCurrTarget = np.concatenate([qCurrTargetNotHolding,qCurrTargetHolding],axis=1)

            td_error = qCurrTarget[range(batch_size),states_t] - targets
            qCurrTarget[range(batch_size),states_t] = targets

            targetTrainNotHolding(actionPatches, np.reshape(qCurrTarget[:,0],[batch_size,1]), np.reshape(weights,[batch_size,1]))
            targetTrainHolding(actionPatches, np.reshape(qCurrTarget[:,1],[batch_size,1]), np.reshape(weights,[batch_size,1]))

            if prioritized_replay:
                new_priorities = np.abs(td_error) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

            # accumulate a scalar summary of the batch TD error for logging
            td_errors[-1] += np.mean(np.abs(td_error))


        # bookkeeping for storing episode rewards
        episode_rewards[-1] += rew
        if done:
            new_obs = env.reset()
            episode_rewards.append(0.0)
            td_errors.append(0.0)
        mean_50ep_reward = round(np.mean(episode_rewards[-51:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            timerFinal = time.time()
            print("steps: " + str(t) + ", episodes: " + str(num_episodes) + ", mean 50 episode reward: " + str(mean_50ep_reward) + ", % time spent exploring: " + str(int(100 * exploration.value(t))) + ", time elapsed: " + str(timerFinal - timerStart))
            timerStart = timerFinal
            timerStart = timerFinal
        
        obs = np.copy(new_obs)

    # save what we learned
    if fileOut != "None":
        saver = tf.train.Saver()
        saver.save(sess, fileOut)

    # display value function
    obs = env.reset()
    moveDescriptors = getMoveActionDescriptors([obs[0]])
    moveDescriptors = moveDescriptors*2-1
    gridSize = np.int32(np.sqrt(np.shape(moveDescriptors)[0]))

    actionsPickDescriptors = np.stack([moveDescriptors, np.zeros(np.shape(moveDescriptors))],axis=3)
    actionsPlaceDescriptors = np.stack([np.zeros(np.shape(moveDescriptors)), moveDescriptors],axis=3)
    
    print(str(obs[0][:,:,0]))
    
    qPickNotHolding = getqNotHolding(actionsPickDescriptors)
    qPickHolding = getqHolding(actionsPickDescriptors)
    qPick = np.concatenate([qPickNotHolding,qPickHolding],axis=1)
    print("Value function for pick action in hold-nothing state:")
    print(str(np.reshape(qPick[:,0],[gridSize,gridSize])))
    print("Value function for pick action in hold-1 state:")
    print(str(np.reshape(qPick[:,1],[gridSize,gridSize])))

    qPlaceNotHolding = getqNotHolding(actionsPlaceDescriptors)
    qPlaceHolding = getqHolding(actionsPlaceDescriptors)
    qPlace = np.concatenate([qPlaceNotHolding,qPlaceHolding],axis=1)
    print("Value function for place action in hold-nothing state:")
    print(str(np.reshape(qPlace[:,0],[gridSize,gridSize])))
    print("Value function for place action in hold-1 state:")
    print(str(np.reshape(qPlace[:,1],[gridSize,gridSize])))
    
    # Show the scene beside the learned pick (hold-nothing) and place
    # (hold-1) value maps.
    plt.subplot(1, 3, 1)
    plt.imshow(np.tile(env.state[0], [1, 1, 3]))
    plt.subplot(1, 3, 2)
    plt.imshow(np.reshape(qPick[:, 0], [gridSize, gridSize]))
    plt.subplot(1, 3, 3)
    plt.imshow(np.reshape(qPlace[:, 1], [gridSize, gridSize]))
    plt.show()