Example #1
def build_targetTrain_DQN(make_obs_ph,
                          make_target_ph,
                          q_func,
                          num_actions,
                          optimizer,
                          scope="deepq",
                          reuse=None):

    with tf.variable_scope(scope, reuse=reuse):

        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        target_input = U.ensure_tf_input(make_target_ph("target"))

        # get variables
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # q values for all actions
        q_t_raw = q_func(obs_t_input.get(),
                         num_actions,
                         scope="q_func",
                         reuse=True)

        # calculate error
        td_error = q_t_raw - tf.stop_gradient(target_input.get())
        errors = U.huber_loss(td_error)

        optimize_expr = optimizer.minimize(errors, var_list=q_func_vars)

        targetTrain = U.function(inputs=[obs_t_input, target_input],
                                 outputs=[td_error],
                                 updates=[optimize_expr])

        return targetTrain
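build_targetTrain_DQN calls q_func with reuse=True, so it assumes the "q_func" variables already exist under the same outer scope, e.g. created by build_getq_DQN (Example #13 below). A minimal usage sketch under that assumption; the placeholder makers, model, and batch arrays are hypothetical, not part of the snippet above:

import numpy as np
import tensorflow as tf

getq = build_getq_DQN(make_obs_ph, model, num_actions)
targetTrain = build_targetTrain_DQN(
    make_obs_ph, make_target_ph, model, num_actions,
    optimizer=tf.train.AdamOptimizer(learning_rate=1e-4))
U.initialize()

q_t = getq(obs_batch)                      # Q(s, .) for the whole batch
q_tp1 = getq(obs_next_batch)               # bootstrap values
targets = q_t.copy()                       # untouched actions get zero TD error
targets[np.arange(len(acts)), acts] = (
    rewards + gamma * (1.0 - dones) * q_tp1.max(axis=1))
td = targetTrain(obs_batch, targets)       # one gradient step on the Huber loss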
Example #2
def build_targetTrain(make_actionDeic_ph,
                        make_target_ph,
                        make_weight_ph,
                        q_func,
                        num_states,
                        num_cascade,
                        optimizer,
                        scope="deepq", 
                        qscope="q_func",
                        grad_norm_clipping=None,
                        reuse=None):

    with tf.variable_scope(scope, reuse=reuse):
        
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_actionDeic_ph("action_t_deic"))
        target_input = U.ensure_tf_input(make_target_ph("target"))
        importance_weights_ph = U.ensure_tf_input(make_weight_ph("weights"))
    
        # get variables
        q_func_vars = U.scope_vars(U.absolute_scope_name(qscope))
    
        # q value for each action descriptor (a single output unit)
        q_t_raw = q_func(obs_t_input.get(), 1, scope=qscope, reuse=True)
        targetTiled = tf.reshape(target_input.get(), shape=(-1, 1))
        
        # calculate error
        td_error = q_t_raw - tf.stop_gradient(targetTiled)
        errors = importance_weights_ph.get() * U.huber_loss(td_error)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                errors,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(errors, var_list=q_func_vars)
    
        
        targetTrain = U.function(
            inputs=[
                obs_t_input,
                target_input,
                importance_weights_ph
            ],
            outputs=[td_error, obs_t_input.get(), target_input.get()],
            updates=[optimize_expr]
        )
    
        return targetTrain
Example #3
def build_getq(make_deic_ph, q_func, scope="deepq", qscope="q_func", reuse=None):

    with tf.variable_scope(scope, reuse=reuse):
        actions_ph = U.ensure_tf_input(make_deic_ph("stateaction"))
        q_values = q_func(actions_ph.get(), 1, scope=qscope)
        getq = U.function(inputs=[actions_ph], outputs=q_values)
        return getq
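Here make_deic_ph must return something U.ensure_tf_input can wrap. One plausible implementation, sketched with the BatchInput helper from baselines.deepq.utils (the helper choice and the 3x3x2 descriptor shape are assumptions):

from baselines.deepq.utils import BatchInput  # assumed placeholder helper

deicticShape = (3, 3, 2)   # e.g. a 3x3 patch with two one-hot channels

def make_deic_ph(name):
    # float32 placeholder of shape [None, 3, 3, 2], wrapped as a TfInput
    return BatchInput(deicticShape, name=name)

getq = build_getq(make_deic_ph, q_func)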
Example #4
def build_getq_fullstate(make_fullImage_ph, q_func, num_actions, num_cascade, scope="deepq", qscope="q_func", reuse=None):

    with tf.variable_scope(scope, reuse=reuse):
        state_ph = U.ensure_tf_input(make_fullImage_ph("state"))
        q_values = q_func(state_ph.get(), num_actions, scope=qscope)
        getq = U.function(inputs=[state_ph], outputs=q_values)
        return getq
Example #5
def build_getDeic_FocCoarse(make_obs_ph,deicticShape):
    
    observations_ph = U.ensure_tf_input(make_obs_ph("observation"))

    # create padded image
    obs = observations_ph.get()
    shape = tf.shape(obs)
    obsZeroPadded = tf.image.resize_image_with_crop_or_pad(obs,shape[1]+2*deicticShape[0],shape[2]+2*deicticShape[1])

    # extract large patches from padded image
    patchesLarge = tf.extract_image_patches(
            obsZeroPadded,
            ksizes=[1, 3*deicticShape[0], 3*deicticShape[1], 1], 
            strides=[1, 1, 1, 1], 
            rates=[1, 1, 1, 1], 
            padding='VALID')

    # reshape into focused and large images
    patchesShape = tf.shape(patchesLarge)
    patchesTiledLarge = tf.reshape(patchesLarge,[patchesShape[0]*patchesShape[1]*patchesShape[2],3*deicticShape[0],3*deicticShape[1],1])
    patchesTiledFocused = patchesTiledLarge[:,deicticShape[0]:2*deicticShape[0],deicticShape[1]:2*deicticShape[1],0]

    # get two coarse images: one for agent and one for the ghost (might make this more efficient by doing the resize only once...)
    coarseAgent = tf.image.resize_area(tf.cast(tf.equal(patchesTiledLarge,1.),tf.int32), deicticShape[0:2])[:,:,:,0] > 0
    coarseGhost = tf.image.resize_area(tf.cast(tf.equal(patchesTiledLarge,2.),tf.int32), deicticShape[0:2])[:,:,:,0] > 0

    patchesTiledStacked = tf.stack([tf.equal(patchesTiledFocused,1), tf.equal(patchesTiledFocused,2), coarseAgent, coarseGhost],axis=-1)
    
    getDeic = U.function(inputs=[observations_ph], outputs=patchesTiledStacked)
    return getDeic
Example #6
def build_getMoveActionDescriptors(make_obs_ph,deicticShape):
    
    if (deicticShape[0] % 2 == 0) or (deicticShape[1] % 2 == 0):
        raise ValueError("build_getMoveActionDescriptors: first two elements of deicticShape must be odd")

    observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
    obs = observations_ph.get()
    shape = tf.shape(obs)
    deicticPad = np.int32(np.array(deicticShape) - 1)  # int cast: the padded sizes below must be integers
    obsZeroPadded = tf.image.resize_image_with_crop_or_pad(obs,shape[1]+deicticPad[0],shape[2]+deicticPad[1])
    patches = tf.extract_image_patches(
            obsZeroPadded,
            ksizes=[1, deicticShape[0], deicticShape[1], 1],
            strides=[1, 1, 1, 1],
            rates=[1, 1, 1, 1],
            padding='VALID')
    patchesShape = tf.shape(patches)
    patchesTiled = tf.reshape(patches,[patchesShape[0]*patchesShape[1]*patchesShape[2],deicticShape[0],deicticShape[1]])
    getMoveActionDescriptors = U.function(inputs=[observations_ph], outputs=patchesTiled)
    return getMoveActionDescriptors
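For an odd deicticShape (d0, d1), padding by (d0 - 1, d1 - 1) and then taking VALID patches yields exactly one d0 x d1 window centered on every pixel of the original image. A standalone TF 1.x sanity check of that arithmetic (toy 8x8 image; no U module needed):

import numpy as np
import tensorflow as tf

d = (3, 3)                                   # deicticShape, both odd
x = tf.placeholder(tf.float32, [None, 8, 8, 1])
xp = tf.image.resize_image_with_crop_or_pad(x, 8 + (d[0] - 1), 8 + (d[1] - 1))
p = tf.extract_image_patches(xp,
                             ksizes=[1, d[0], d[1], 1],
                             strides=[1, 1, 1, 1],
                             rates=[1, 1, 1, 1],
                             padding='VALID')
with tf.Session() as sess:
    out = sess.run(p, {x: np.zeros((2, 8, 8, 1), np.float32)})
print(out.shape)  # (2, 8, 8, 9): one flattened 3x3 patch per original pixel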
Example #7
def build_getMoveActionDescriptors(make_obs_ph, actionShape, actionShapeSmall,
                                   stride):

    observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
    obs = observations_ph.get()
    shape = tf.shape(obs)
    deicticPad = np.int32(2 * np.floor(np.array(actionShape) / 3))
    obsZeroPadded = tf.image.resize_image_with_crop_or_pad(
        obs, shape[1] + deicticPad[0], shape[2] + deicticPad[1])
    patches = tf.extract_image_patches(
        obsZeroPadded,
        ksizes=[1, actionShape[0], actionShape[1], 1],
        strides=[1, stride, stride, 1],
        rates=[1, 1, 1, 1],
        padding='VALID')
    patchesShape = tf.shape(patches)
    patchesTiled = tf.reshape(patches, [
        patchesShape[0] * patchesShape[1] * patchesShape[2], actionShape[0],
        actionShape[1], 1
    ])
    patchesTiledSmall = tf.image.resize_images(
        patchesTiled, [actionShapeSmall[0], actionShapeSmall[1]])
    patchesTiledSmall = tf.reshape(
        patchesTiledSmall, [-1, actionShapeSmall[0], actionShapeSmall[1]])

    getMoveActionDescriptors = U.function(inputs=[observations_ph],
                                          outputs=patchesTiledSmall)
    return getMoveActionDescriptors
Example #8
def build_getMoveActionDescriptorsRot(make_obs_ph,actionShape,actionShapeSmall,stride):
    
    observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
    obs = observations_ph.get()
    shape = tf.shape(obs)
    deicticPad = np.int32(2*np.floor(np.array(actionShape)/3))
    obsZeroPadded = tf.image.resize_image_with_crop_or_pad(obs,shape[1]+deicticPad[0],shape[2]+deicticPad[1])
    patches = tf.extract_image_patches(
            obsZeroPadded,
            ksizes=[1, actionShape[0], actionShape[1], 1],
            strides=[1, stride, stride, 1], 
            rates=[1, 1, 1, 1], 
            padding='VALID')
    patchesShape = tf.shape(patches)
    patchesTiled = tf.reshape(patches,[patchesShape[0]*patchesShape[1]*patchesShape[2],actionShape[0],actionShape[1],1])
    
    patchesTiledRot0 = patchesTiled
    patchesTiledRot1 = tf.contrib.image.rotate(patchesTiled,np.pi/4)
    patchesTiledRot2 = tf.contrib.image.rotate(patchesTiled,2*np.pi/4)
    patchesTiledRot3 = tf.contrib.image.rotate(patchesTiled,3*np.pi/4)

    patchesTiledAll = tf.concat([patchesTiledRot0,patchesTiledRot1,patchesTiledRot2,patchesTiledRot3],axis=0)
    
    patchesTiledSmall = tf.image.resize_images(patchesTiledAll, [actionShapeSmall[0], actionShapeSmall[1]])
    patchesTiledSmall = tf.reshape(patchesTiledSmall,[-1,actionShapeSmall[0],actionShapeSmall[1]])

    getMoveActionDescriptors = U.function(inputs=[observations_ph], outputs=patchesTiledSmall)
    return getMoveActionDescriptors
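The four rotations are stacked along the batch axis, so descriptor k of the output is rotation k // P applied to patch k % P, with P the number of patches per image. Recovering that indexing on the numpy side, as a sketch (descriptors is the assumed output array):

import numpy as np

num_rotations = 4
P = descriptors.shape[0] // num_rotations      # patches per rotation block
rotation_idx = np.arange(descriptors.shape[0]) // P
patch_idx = np.arange(descriptors.shape[0]) % P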
Example #9
def build_train_cascaded(make_obs_ph,
                         make_target_ph,
                         q_func,
                         num_cascade,
                         num_actions,
                         optimizer,
                         grad_norm_clipping=None,
                         double_q=True,
                         scope="deepq",
                         reuse=None):

    getq_f = build_getq(make_obs_ph,
                        q_func,
                        num_actions,
                        num_cascade,
                        scope=scope,
                        reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):

        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        target_input = U.ensure_tf_input(make_target_ph("target"))

        # get variables
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # q values for all actions
        q_t_raw = q_func(obs_t_input.get(),
                         num_actions * num_cascade,
                         scope="q_func",
                         reuse=True)
        q_t = tf.reshape(q_t_raw, shape=(-1, num_cascade, num_actions))

        # calculate error
        td_error = q_t - tf.stop_gradient(target_input.get())
        errors = U.huber_loss(td_error)

        optimize_expr = optimizer.minimize(errors, var_list=q_func_vars)

        targetTrain = U.function(
            inputs=[obs_t_input, target_input],
            outputs=[td_error, obs_t_input.get(),
                     target_input.get()],
            updates=[optimize_expr])

    return getq_f, targetTrain
Example #10
def build_get_2channelobs(make_obs_ph):
    observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
    # two-channel one-hot encoding of grid values 1 and 2
    patchesTiledStacked = tf.stack(
        [tf.equal(observations_ph.get()[:, :, :, 0], 1),
         tf.equal(observations_ph.get()[:, :, :, 0], 2)],
        axis=-1)
    getDeic = U.function(inputs=[observations_ph], outputs=patchesTiledStacked)
    return getDeic
Example #11
def build_getq(make_obsDeic_ph, q_func, num_actions, num_cascade, scope="deepq", qscope="q_func", reuse=None):

    with tf.variable_scope(scope, reuse=reuse):

        observations_ph = U.ensure_tf_input(make_obsDeic_ph("observation"))
        q_values = q_func(observations_ph.get(), num_actions*num_cascade, scope=qscope)
        q_valuesTiled = tf.reshape(q_values,[-1,num_cascade,num_actions])
        getq = U.function(inputs=[observations_ph], outputs=q_valuesTiled)
        return getq
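The reshape exposes the network output as one row of q values per cascade level. A usage sketch on the numpy side, where obs_batch and the cascade level c are assumptions:

q = getq(obs_batch)              # shape (batch, num_cascade, num_actions)
a_c = q[:, c, :].argmax(axis=1)  # greedy action at cascade level c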
Example #12
def build_getMoveActionDescriptorsRot(make_obs_ph, actionShape,
                                      actionShapeSmall, stride,
                                      numOrientations):

    observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
    obs = observations_ph.get()
    shape = tf.shape(obs)
    deicticPad = np.int32(2 * np.floor(np.array(actionShape) / 3))
    obsZeroPadded = tf.image.resize_image_with_crop_or_pad(
        obs, shape[1] + deicticPad[0], shape[2] + deicticPad[1])
    patches = tf.extract_image_patches(
        obsZeroPadded,
        ksizes=[1, actionShape[0], actionShape[1], 1],
        strides=[1, stride, stride, 1],
        rates=[1, 1, 1, 1],
        padding='VALID')
    patchesShape = tf.shape(patches)
    patchesTiled = tf.reshape(patches, [
        patchesShape[0] * patchesShape[1] * patchesShape[2], actionShape[0],
        actionShape[1], 1
    ])

    patchesTiledRot0 = patchesTiled
    patchesTiledRot1 = tf.contrib.image.rotate(patchesTiled, np.pi / 8)
    patchesTiledRot2 = tf.contrib.image.rotate(patchesTiled, 2 * np.pi / 8)
    patchesTiledRot3 = tf.contrib.image.rotate(patchesTiled, 3 * np.pi / 8)
    patchesTiledRot4 = tf.contrib.image.rotate(patchesTiled, 4 * np.pi / 8)
    patchesTiledRot5 = tf.contrib.image.rotate(patchesTiled, 5 * np.pi / 8)
    patchesTiledRot6 = tf.contrib.image.rotate(patchesTiled, 6 * np.pi / 8)
    patchesTiledRot7 = tf.contrib.image.rotate(patchesTiled, 7 * np.pi / 8)

    if numOrientations == 4:
        patchesTiledAll = tf.concat([
            patchesTiledRot0, patchesTiledRot0, patchesTiledRot2,
            patchesTiledRot2, patchesTiledRot4, patchesTiledRot4,
            patchesTiledRot6, patchesTiledRot6
        ],
                                    axis=0)
    elif numOrientations == 8:
        patchesTiledAll = tf.concat([
            patchesTiledRot0, patchesTiledRot1, patchesTiledRot2,
            patchesTiledRot3, patchesTiledRot4, patchesTiledRot5,
            patchesTiledRot6, patchesTiledRot7
        ],
                                    axis=0)
    else:
        raise ValueError('invalid number of orientations: %d' % numOrientations)

    patchesTiledSmall = tf.image.resize_images(
        patchesTiledAll, [actionShapeSmall[0], actionShapeSmall[1]])
    patchesTiledSmall = tf.reshape(
        patchesTiledSmall, [-1, actionShapeSmall[0], actionShapeSmall[1]])

    getMoveActionDescriptors = U.function(inputs=[observations_ph],
                                          outputs=patchesTiledSmall)
    return getMoveActionDescriptors
Example #13
def build_getq_DQN(make_obs_ph,
                   q_func,
                   num_actions,
                   scope="deepq",
                   reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        getq = U.function(inputs=[observations_ph], outputs=q_values)
        return getq
Example #14
def build_getq(make_actionDeic_ph, q_func, num_states, num_cascade, scope="deepq", qscope="q_func", reuse=None):

    with tf.variable_scope(scope, reuse=reuse):

        actions_ph = U.ensure_tf_input(make_actionDeic_ph("actions"))
        # note: num_cascade is accepted but unused in this variant
        q_values = q_func(actions_ph.get(), num_states, scope=qscope)
        q_valuesTiled = tf.reshape(q_values, [-1, num_states])
        getq = U.function(inputs=[actions_ph], outputs=q_valuesTiled)
        return getq
Example #15
def build_getDeic_Foc(make_obs_ph,deicticShape):
    observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
    patches = tf.extract_image_patches(
            observations_ph.get(),
            ksizes=[1, deicticShape[0], deicticShape[1], 1], 
            strides=[1, 1, 1, 1], 
            rates=[1, 1, 1, 1], 
            padding='VALID')
    patchesShape = tf.shape(patches)
    patchesTiled = tf.reshape(patches,[patchesShape[0]*patchesShape[1]*patchesShape[2],deicticShape[0],deicticShape[1]])
    patchesTiledStacked = tf.stack([tf.equal(patchesTiled,1), tf.equal(patchesTiled,2)],axis=-1)
    getDeic = U.function(inputs=[observations_ph], outputs=patchesTiledStacked)
    return getDeic
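The closing tf.stack is just a two-channel one-hot encoding of grid values 1 and 2 (agent and ghost in Example #5). The same step in plain numpy, for intuition:

import numpy as np

patches = np.array([[[0, 1],
                     [2, 0]]])                       # one 2x2 patch
stacked = np.stack([patches == 1, patches == 2], axis=-1)
print(stacked.shape)  # (1, 2, 2, 2): channel 0 flags the 1s, channel 1 the 2s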
Example #16
def build_getMoveActionDescriptorsRot(make_obs_ph, patchSize, handSize,
                                      smallSize, stride):

    observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
    obs = observations_ph.get()
    origImShape = tf.shape(obs)
    patchExpanded = np.int32(patchSize * np.sqrt(2)) + 1
    amt2Pad = patchExpanded - handSize
    obsZeroPadded = tf.image.resize_image_with_crop_or_pad(
        obs, origImShape[1] + amt2Pad, origImShape[2] + amt2Pad)
    patches = tf.extract_image_patches(
        obsZeroPadded,
        ksizes=[1, patchExpanded, patchExpanded, 1],
        strides=[1, stride, stride, 1],
        rates=[1, 1, 1, 1],
        padding='VALID')
    patchesShape = tf.shape(patches)
    patchesTiled = tf.reshape(patches, [
        patchesShape[0] * patchesShape[1] * patchesShape[2], patchExpanded,
        patchExpanded, 1
    ])
    patchesTiledRot0 = tf.contrib.image.rotate(patchesTiled, 0)
    patchesTiledRot1 = tf.contrib.image.rotate(patchesTiled, np.pi / 4)
    patchesTiledRot2 = tf.contrib.image.rotate(patchesTiled, 2 * np.pi / 4)
    patchesTiledRot3 = tf.contrib.image.rotate(patchesTiled, 3 * np.pi / 4)
    patchesTiledAll = tf.concat(
        [patchesTiledRot0, patchesTiledRot1, patchesTiledRot2, patchesTiledRot3],
        axis=0)

    patchesTiledRotCrop = tf.image.resize_image_with_crop_or_pad(
        patchesTiledAll, patchSize, patchSize)
    patchesTiledSmall = tf.image.resize_images(patchesTiledRotCrop,
                                               [smallSize, smallSize])
    patchesTiledSmall = tf.reshape(patchesTiledSmall,
                                   [-1, smallSize, smallSize])

    getMoveActionDescriptors = U.function(inputs=[observations_ph],
                                          outputs=patchesTiledSmall)
    return getMoveActionDescriptors
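The expanded window is sized so that a patchSize x patchSize region stays inside it under any rotation: the diagonal of the target patch is patchSize * sqrt(2), and the int32 truncation plus one acts as a ceiling. A quick check with a made-up patchSize:

import numpy as np

patchSize = 24
patchExpanded = np.int32(patchSize * np.sqrt(2)) + 1   # 33.9 -> 33, +1 = 34
# rotating the 34x34 window and center-cropping back to 24x24 (as done above
# with resize_image_with_crop_or_pad) keeps the whole patch in view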
Example #17
def build_getDeic(make_obs_ph, deicticShape):
    observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
    patches = tf.extract_image_patches(
        observations_ph.get(),
        ksizes=[1, deicticShape[0], deicticShape[1], 1],
        strides=[1, 1, 1, 1],
        rates=[1, 1, 1, 1],
        padding='VALID')
    patchesShape = tf.shape(patches)
    patchesTiled = tf.reshape(patches, [
        patchesShape[0] * patchesShape[1] * patchesShape[2], deicticShape[0],
        deicticShape[1]
    ])

    # NOTE: the 4x4 coarse map is hard-coded; the reshape below only lines up
    # when deicticShape == (4, 4) and the observation has a single channel
    coarse = tf.image.resize_area(observations_ph.get(), (4, 4)) * 4

    coarseTiled = tf.transpose(
        tf.tile([coarse], [patchesShape[1] * patchesShape[2], 1, 1, 1, 1]),
        [1, 0, 2, 3, 4])

    coarseTiledReshape = tf.reshape(coarseTiled, [
        patchesShape[0] * patchesShape[1] * patchesShape[2], deicticShape[0],
        deicticShape[1]
    ])

    patchesTiledStacked = tf.stack([
        tf.cast(tf.equal(patchesTiled, 1), tf.float32),
        tf.cast(tf.equal(patchesTiled, 2), tf.float32), coarseTiledReshape
    ],
                                   axis=-1)

    getDeic = U.function(inputs=[observations_ph], outputs=patchesTiledStacked)

    return getDeic
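For reference, the shape bookkeeping in the tiling above, assuming a one-channel observation with B images and P patches per image:

# coarse:             (B, 4, 4, 1)     one downsampled map per image
# [coarse] tiled:     (P, B, 4, 4, 1)  then transposed to (B, P, 4, 4, 1)
# coarseTiledReshape: (B * P, 4, 4)    requires deicticShape == (4, 4)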
Example #18
def build_train_deictic(make_obs_ph,
                        q_func,
                        num_actions,
                        optimizer,
                        grad_norm_clipping=None,
                        gamma=1.0,
                        double_q=True,
                        scope="deepq",
                        reuse=None):

    getq_f = build_getq(make_obs_ph,
                        q_func,
                        num_actions,
                        scope=scope,
                        reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters created by getq_f
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)

        # NOTE: batch_size = 32 and num_deictic_patches = 25 are hard-coded here
        q_tp1_best = tf.reshape(q_tp1_best, [32, 25])
        q_tp1_best_reduced = tf.reduce_max(q_tp1_best, 1)

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_reduced
        q_t_selected_target_masked = (1.0 - done_mask_ph) * q_t_selected_target

        # tile the per-state target across its 25 patches (32 * 25 = 800)
        q_t_selected_target_tiled = tf.tile(
            tf.reshape(q_t_selected_target_masked, [32, 1]), [1, 25])
        q_t_selected_target_expanded = tf.reshape(q_t_selected_target_tiled,
                                                  [800])

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(
            q_t_selected_target_expanded)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=td_error,
                           updates=[optimize_expr])

        # Create a diagnostics-only function (no parameter updates)
        trainWOUpdate = U.function(
            inputs=[
                obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
                importance_weights_ph
            ],
            outputs=[q_t_selected_target_expanded, errors, td_error])

        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return getq_f, train, trainWOUpdate, update_target, {
            'q_values': q_values
        }
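Since trainWOUpdate has no updates= argument, it evaluates the targets and errors without touching the network weights, which is handy for debugging the hard-coded 32 x 25 batch layout. A usage sketch (the batch arrays are assumptions):

# diagnostic pass: no optimizer step is applied
targets, errors, td = trainWOUpdate(obses_t, actions, rewards,
                                    obses_tp1, dones, weights)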
Example #19
def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that take a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        eps = tf.get_variable("eps", (),
                              initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        deterministic_actions = tf.argmax(q_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]),
                                           minval=0,
                                           maxval=num_actions,
                                           dtype=tf.int64)
        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions,
                                      deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                                 lambda: deterministic_actions)
        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))

        act = U.function(
            inputs=[observations_ph, stochastic_ph, update_eps_ph],
            outputs=output_actions,
            givens={
                update_eps_ph: -1.0,
                stochastic_ph: True
            },
            updates=[update_eps_expr])
        return act
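Because of the givens above, stochastic defaults to True and update_eps to -1.0 (which leaves eps unchanged), so callers override only what they need. A usage sketch, assuming the keyword calling convention of baselines-style U.function and a single observation obs:

import numpy as np

# epsilon-greedy step: also writes 0.1 into the eps variable
action = act(np.array(obs)[None], update_eps=0.1)[0]

# greedy evaluation: eps stays as-is and random actions are disabled
action = act(np.array(obs)[None], stochastic=False)[0]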
Example #20
def build_train_cascaded(make_obs_ph,
                         make_target_ph,
                         make_actions_ph,
                         q_func,
                         num_cascade,
                         num_actions,
                         batch_size,
                         num_deictic_patches,
                         optimizer,
                         gamma=1.0,
                         grad_norm_clipping=None,
                         double_q=True,
                         scope="deepq",
                         reuse=None):

    getq_f = build_getq(make_obs_ph,
                        q_func,
                        num_actions * num_cascade,
                        scope=scope,
                        scope_q_func="q_func",
                        reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):

        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        actions_input = U.ensure_tf_input(make_actions_ph("actions"))
        target_input = U.ensure_tf_input(make_target_ph("target"))

        # get variables
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # q values for all actions
        q_t_raw = q_func(obs_t_input.get(),
                         num_actions * num_cascade,
                         scope="q_func",
                         reuse=True)  # reuse parameters created by getq_f
        q_t = tf.reshape(
            q_t_raw,
            [batch_size * num_deictic_patches, num_cascade, num_actions])

        # q values for selected actions
        actionsTiled = tf.one_hot(actions_input.get(), num_actions)
        q_t_action_select = tf.reduce_sum(q_t * actionsTiled, 2)

        # calculate error
        td_error = q_t_action_select - tf.stop_gradient(target_input.get())
        errors = U.huber_loss(td_error)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                errors,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(errors, var_list=q_func_vars)

        # Create callable functions
        targetTrain = U.function(
            inputs=[obs_t_input, actions_input, target_input],
            outputs=[td_error, q_t_action_select,
                     target_input.get()],
            updates=[optimize_expr])

    return getq_f, targetTrain
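Given the reshape to [batch_size * num_deictic_patches, num_cascade, num_actions], the tensors fed to targetTrain must follow the same flattened layout. A sketch of the expected shapes (the array names are assumptions):

# action_batch: (batch_size * num_deictic_patches, num_cascade) int actions,
#               so its one-hot lines up with q_t for the reduce_sum over axis 2
# target_batch: (batch_size * num_deictic_patches, num_cascade) targets
td, q_sel, tgt = targetTrain(obs_batch, action_batch, target_batch)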
Example #21
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act(make_obs_ph,
                      q_func,
                      num_actions,
                      scope=scope,
                      reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=td_error,
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
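A minimal training-loop skeleton around these four return values, following the usual baselines pattern; the replay buffer, schedules, and constants are assumptions:

act, train, update_target, debug = build_train(
    make_obs_ph, q_func, num_actions,
    optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
    grad_norm_clipping=10)

U.initialize()
update_target()                      # start with target net == online net

for t in range(num_steps):
    # ... act() in the environment and store transitions in replay_buffer ...
    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
    weights = np.ones_like(rewards)  # uniform importance weights
    td = train(obses_t, actions, rewards, obses_tp1, dones, weights)
    if t % target_update_freq == 0:
        update_target()              # periodically sync the target network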
Example #22
def build_train_deictic_min_streamlined(make_obs_ph,
                                        q_func,
                                        num_actions,
                                        batch_size,
                                        num_deictic_patches,
                                        max_num_groups,
                                        optimizer,
                                        gamma=1.0,
                                        grad_norm_clipping=None,
                                        double_q=True,
                                        scope="deepq",
                                        reuse=None):

    getq_f = build_getq(make_obs_ph,
                        q_func,
                        num_actions,
                        scope=scope,
                        reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):

        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        group_matching_ph = tf.placeholder(tf.int32, [None],
                                           name="group_matching")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done_mask")

        # Placeholder that lets a tabular version of this code supply the
        # next-state q values directly; NOTE: 4 == num_actions is hard-coded
        q_tp1_ph = tf.placeholder(tf.float32, [None, 4], name="q_tp1")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters created by getq_f
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation, supplied externally via q_tp1_ph
        q_tp1 = q_tp1_ph
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # compute estimate of best possible value starting from state at t + 1
        # (both branches must define q_tp1_maxa; it is used below)
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_maxa = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_maxa = tf.reduce_max(q_tp1, 1)

        # Calculate target = max_{a,d} Q(d,a). Size should be: Bx1
        q_tp1_maxa_reshape = tf.reshape(q_tp1_maxa,
                                        [batch_size, num_deictic_patches])
        q_tp1_target = tf.reduce_max(q_tp1_maxa_reshape, 1)
        q_tp1_target = rew_t_ph + gamma * q_tp1_target
        q_tp1_target = (1.0 - done_mask_ph) * q_tp1_target

        # Calculate desc_2_state. Encodes which descriptors contained in each state.
        # Dimensions should be: B x max_num_groups
        group_matching_3dtensor = tf.reshape(group_matching_ph,
                                             [batch_size, num_deictic_patches])
        groups_onehot = tf.one_hot(group_matching_3dtensor,
                                   max_num_groups,
                                   axis=-1)
        desc_2_state = tf.reduce_max(groups_onehot, 1)

        # Calculate target_min_per_D
        max_target = tf.reduce_max(q_tp1_target)
        q_tp1_target_tiled = tf.tile(tf.reshape(q_tp1_target, [batch_size, 1]),
                                     [1, max_num_groups])
        target_min_per_D_expanded = desc_2_state * q_tp1_target_tiled + (
            1 - desc_2_state) * max_target
        target_min_per_D = tf.reduce_min(target_min_per_D_expanded, 0)

        # Project target_min_per_D onto q_t
        D_2_DI = tf.one_hot(group_matching_ph, max_num_groups)

        # project the per-group targets back onto the individual descriptors
        targets = tf.squeeze(
            tf.matmul(D_2_DI, tf.reshape(target_min_per_D,
                                         [max_num_groups, 1])))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(targets)
        errors = U.huber_loss(td_error)
        weighted_error = errors  # no importance weighting in this variant

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, group_matching_ph,
            done_mask_ph
        ],
                           outputs=[targets],
                           updates=[optimize_expr])

        # Create callable functions
        trainWOUpdate = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, group_matching_ph,
            done_mask_ph, q_tp1_ph
        ],
                                   outputs=[
                                       q_tp1_target, desc_2_state,
                                       target_min_per_D, D_2_DI, targets
                                   ])

        update_target = U.function([], [], updates=[update_target_expr])
        return getq_f, train, trainWOUpdate, update_target
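For intuition about the group bookkeeping: desc_2_state marks which descriptor groups occur in each sampled state, and target_min_per_D then takes, per group, the minimum Bellman target over the states containing it. A toy numpy rendering of those steps (all values made up):

import numpy as np

batch_size, num_patches, max_groups = 2, 3, 3
group_matching = np.array([0, 2, 2, 1, 0, 1])           # flattened [B * P]
g3d = group_matching.reshape(batch_size, num_patches)
groups_onehot = np.eye(max_groups)[g3d]                  # [B, P, G]
desc_2_state = groups_onehot.max(axis=1)                 # groups present per state
q_tp1_target = np.array([1.0, 5.0])                      # per-state targets
tiled = (desc_2_state * q_tp1_target[:, None] +
         (1 - desc_2_state) * q_tp1_target.max())        # absent groups -> max
target_min_per_D = tiled.min(axis=0)                     # => [1., 5., 1.]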