def build_targetTrain_DQN(make_obs_ph,
                          make_target_ph,
                          q_func,
                          num_actions,
                          optimizer,
                          scope="deepq",
                          reuse=None):
    """Build a callable that moves the Q-network towards externally supplied targets.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        called once with "obs_t" to create the observation input.
    make_target_ph: str -> tf.placeholder or TfInput
        called once with "target" to create the target input.
    q_func: callable
        invoked as ``q_func(obs, num_actions, scope="q_func", reuse=True)``;
        because reuse=True is passed, the "q_func" variables are expected to
        exist already inside `scope`.
    num_actions: int
        output size handed to q_func.
    optimizer: object
        must expose ``minimize(loss, var_list=...)``.
    scope: str or VariableScope
        outer variable scope.
    reuse: bool or None
        forwarded to the outer variable scope.

    Returns
    -------
    targetTrain: function
        takes (obs_t, target), returns ``[td_error]`` and runs one optimizer step.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Inputs: observation first, then the regression target.
        obs_in = U.ensure_tf_input(make_obs_ph("obs_t"))
        tgt_in = U.ensure_tf_input(make_target_ph("target"))

        # Only variables living under "q_func" are updated.
        trainable = U.scope_vars(U.absolute_scope_name("q_func"))

        # Q-values for every action.
        q_all = q_func(obs_in.get(), num_actions, scope="q_func", reuse=True)

        # The target is treated as a constant by the gradient computation.
        td_error = q_all - tf.stop_gradient(tgt_in.get())
        loss = U.huber_loss(td_error)
        train_op = optimizer.minimize(loss, var_list=trainable)

        return U.function(
            inputs=[obs_in, tgt_in],
            outputs=[td_error],
            updates=[train_op],
        )
def build_targetTrain(make_actionDeic_ph,
                      make_target_ph,
                      make_weight_ph,
                      q_func,
                      num_states,
                      num_cascade,
                      optimizer,
                      scope="deepq",
                      qscope="q_func",
                      grad_norm_clipping=None,
                      reuse=None):
    """Build an importance-weighted training callable for a scalar-output Q-network.

    Parameters
    ----------
    make_actionDeic_ph: str -> tf.placeholder or TfInput
        called with "action_t_deic" to create the network input.
    make_target_ph: str -> tf.placeholder or TfInput
        called with "target" to create the target input.
    make_weight_ph: str -> tf.placeholder or TfInput
        called with "target" to create the importance-weight input.
    q_func: callable
        invoked as ``q_func(x, 1, scope=qscope, reuse=True)``; the variables
        under `qscope` are expected to exist already.
    num_states, num_cascade: int
        accepted for interface compatibility; not used by this implementation.
    optimizer: object
        must expose ``minimize(loss, var_list=...)``.
    scope: str or VariableScope
        outer variable scope.
    qscope: str
        scope of the Q-network variables being trained.
    grad_norm_clipping: float or None
        when not None, the step is built with U.minimize_and_clip using this
        value as clip_val.
    reuse: bool or None
        forwarded to the outer variable scope.

    Returns
    -------
    targetTrain: function
        takes (input, target, weights), returns
        ``[td_error, input, target]`` and runs one optimizer step.
    """
    with tf.variable_scope(scope, reuse=reuse):
        descriptor_in = U.ensure_tf_input(make_actionDeic_ph("action_t_deic"))
        tgt_in = U.ensure_tf_input(make_target_ph("target"))
        # NOTE(review): the weight input is requested under the name "target",
        # the same name as the target input -- looks like a copy/paste
        # leftover; confirm before renaming, callers may rely on it.
        weight_in = U.ensure_tf_input(make_weight_ph("target"))

        trainable = U.scope_vars(U.absolute_scope_name(qscope))

        # One Q-value per input row; the target is reshaped into a column
        # so that both operands have a trailing dimension of 1.
        q_scalar = q_func(descriptor_in.get(), 1, scope=qscope, reuse=True)
        tgt_column = tf.reshape(tgt_in.get(), shape=(-1, 1))

        td_error = q_scalar - tf.stop_gradient(tgt_column)
        loss = weight_in.get() * U.huber_loss(td_error)

        if grad_norm_clipping is None:
            train_op = optimizer.minimize(loss, var_list=trainable)
        else:
            train_op = U.minimize_and_clip(optimizer,
                                           loss,
                                           var_list=trainable,
                                           clip_val=grad_norm_clipping)

        return U.function(
            inputs=[descriptor_in, tgt_in, weight_in],
            outputs=[td_error, descriptor_in.get(), tgt_in.get()],
            updates=[train_op],
        )
def build_getq(make_deic_ph, q_func, scope="deepq", qscope="q_func", reuse=None):
    """Build a callable that evaluates a single-output Q-network.

    Parameters
    ----------
    make_deic_ph: str -> tf.placeholder or TfInput
        called with "stateaction" to create the network input.
    q_func: callable
        invoked as ``q_func(x, 1, scope=qscope)``.
    scope: str or VariableScope
        outer variable scope.
    qscope: str
        scope passed to q_func.
    reuse: bool or None
        forwarded to the outer variable scope.

    Returns
    -------
    getq: function
        maps the "stateaction" input to the output of q_func.
    """
    with tf.variable_scope(scope, reuse=reuse):
        net_in = U.ensure_tf_input(make_deic_ph("stateaction"))
        q_out = q_func(net_in.get(), 1, scope=qscope)
        return U.function(inputs=[net_in], outputs=q_out)
def build_getq_fullstate(make_fullImage_ph,
                         q_func,
                         num_actions,
                         num_cascade,
                         scope="deepq",
                         qscope="q_func",
                         reuse=None):
    """Build a callable that evaluates the Q-network on a full-state input.

    Parameters
    ----------
    make_fullImage_ph: str -> tf.placeholder or TfInput
        called with "state" to create the network input.
    q_func: callable
        invoked as ``q_func(x, num_actions, scope=qscope)``.
    num_actions: int
        output size handed to q_func.
    num_cascade: int
        accepted for interface compatibility; not used here.
    scope: str or VariableScope
        outer variable scope.
    qscope: str
        scope passed to q_func.
    reuse: bool or None
        forwarded to the outer variable scope.

    Returns
    -------
    getq: function
        maps the "state" input to the output of q_func.
    """
    with tf.variable_scope(scope, reuse=reuse):
        full_state_in = U.ensure_tf_input(make_fullImage_ph("state"))
        q_out = q_func(full_state_in.get(), num_actions, scope=qscope)
        return U.function(inputs=[full_state_in], outputs=q_out)
def build_getDeic_FocCoarse(make_obs_ph, deicticShape):
    """Build a callable producing focused + coarse deictic patches of an observation.

    For every patch position, a window three times the deictic size is cut
    from the zero-padded observation. Its central third is the "focused"
    view; the whole window, area-resized down to the deictic size, is the
    "coarse" view. Cells equal to 1 are treated as the agent and cells equal
    to 2 as the ghost.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        called with "observation" to create the input.
        # assumes a single-channel NHWC image batch -- TODO confirm
    deicticShape: sequence of int
        deicticShape[0] / deicticShape[1] are the focused patch height / width.

    Returns
    -------
    getDeic: function
        maps an observation batch to a boolean tensor with one row per patch
        and four channels on the last axis:
        (focused == 1, focused == 2, coarse agent, coarse ghost).
    """
    focus_h, focus_w = deicticShape[0], deicticShape[1]
    large_h, large_w = 3 * focus_h, 3 * focus_w

    observations_ph = U.ensure_tf_input(make_obs_ph("observation"))

    # Pad the image before extracting the large windows. Height is padded
    # according to the patch height and width according to the patch width.
    # (The previous code used deicticShape[0] for both axes, which only
    # matched the 3x window for square deictic shapes.)
    obs = observations_ph.get()
    shape = tf.shape(obs)
    obsZeroPadded = tf.image.resize_image_with_crop_or_pad(
        obs, shape[1] + 2 * focus_h, shape[2] + 2 * focus_w)

    # Extract the large (3x) windows with unit stride.
    patchesLarge = tf.extract_image_patches(
        obsZeroPadded,
        ksizes=[1, large_h, large_w, 1],
        strides=[1, 1, 1, 1],
        rates=[1, 1, 1, 1],
        padding='VALID')

    # Flatten (batch, rows, cols) into one leading axis: one row per window.
    patchesShape = tf.shape(patchesLarge)
    patchesTiledLarge = tf.reshape(
        patchesLarge,
        [patchesShape[0] * patchesShape[1] * patchesShape[2], large_h, large_w, 1])

    # The focused view is the central third of each large window.
    patchesTiledFocused = patchesTiledLarge[:, focus_h:2 * focus_h, focus_w:2 * focus_w, 0]

    # Coarse views: a coarse cell is set when any pixel of the corresponding
    # area in the large window holds the agent (1) / ghost (2) marker.
    coarseAgent = tf.image.resize_area(
        tf.cast(tf.equal(patchesTiledLarge, 1.), tf.int32),
        deicticShape[0:2])[:, :, :, 0] > 0
    coarseGhost = tf.image.resize_area(
        tf.cast(tf.equal(patchesTiledLarge, 2.), tf.int32),
        deicticShape[0:2])[:, :, :, 0] > 0

    patchesTiledStacked = tf.stack(
        [
            tf.equal(patchesTiledFocused, 1),
            tf.equal(patchesTiledFocused, 2),
            coarseAgent,
            coarseGhost,
        ],
        axis=-1)

    getDeic = U.function(inputs=[observations_ph], outputs=patchesTiledStacked)
    return getDeic
def build_getMoveActionDescriptors(make_obs_ph, deicticShape):
    """Build a callable that cuts one deictic patch per position out of an observation.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        called with "observation" to create the input.
        # assumes a single-channel NHWC image batch -- TODO confirm
    deicticShape: sequence of int
        patch height and width in its first two entries. An even value only
        triggers a printed message; graph construction still proceeds.

    Returns
    -------
    getMoveActionDescriptors: function
        maps an observation batch to a tensor of shape
        (num_patches, deicticShape[0], deicticShape[1]).
    """
    if any(extent % 2 == 0 for extent in deicticShape[:2]):
        print("build_getActionDescriptors ERROR: first two elts of deicticShape must by odd")

    obs_in = U.ensure_tf_input(make_obs_ph("observation"))
    image = obs_in.get()
    image_shape = tf.shape(image)

    # Grow the image by (patch size - 1) along each axis before patching.
    pad = np.floor(np.array(deicticShape) - 1)
    padded = tf.image.resize_image_with_crop_or_pad(
        image, image_shape[1] + pad[0], image_shape[2] + pad[1])

    patch_h, patch_w = deicticShape[0], deicticShape[1]
    raw = tf.extract_image_patches(
        padded,
        ksizes=[1, patch_h, patch_w, 1],
        strides=[1, 1, 1, 1],
        rates=[1, 1, 1, 1],
        padding='VALID')

    # Merge (batch, rows, cols) into one leading axis: one row per patch.
    raw_shape = tf.shape(raw)
    num_patches = raw_shape[0] * raw_shape[1] * raw_shape[2]
    tiled = tf.reshape(raw, [num_patches, patch_h, patch_w])

    return U.function(inputs=[obs_in], outputs=tiled)
def build_getMoveActionDescriptors(make_obs_ph, actionShape, actionShapeSmall, stride):
    """Build a callable that extracts strided patches and shrinks them.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        called with "observation" to create the input.
        # assumes a single-channel NHWC image batch -- TODO confirm
    actionShape: sequence of int
        height and width of the patches cut from the padded image.
    actionShapeSmall: sequence of int
        height and width each patch is resized to.
    stride: int
        step between neighbouring patches along both image axes.

    Returns
    -------
    getMoveActionDescriptors: function
        maps an observation batch to a tensor of shape
        (num_patches, actionShapeSmall[0], actionShapeSmall[1]).
    """
    obs_in = U.ensure_tf_input(make_obs_ph("observation"))
    image = obs_in.get()
    image_shape = tf.shape(image)

    # Total padding per axis: twice one third of the patch size, rounded down.
    pad = np.int32(2 * np.floor(np.array(actionShape) / 3))
    padded = tf.image.resize_image_with_crop_or_pad(
        image, image_shape[1] + pad[0], image_shape[2] + pad[1])

    win_h, win_w = actionShape[0], actionShape[1]
    raw = tf.extract_image_patches(
        padded,
        ksizes=[1, win_h, win_w, 1],
        strides=[1, stride, stride, 1],
        rates=[1, 1, 1, 1],
        padding='VALID')

    # One row per patch, keeping an explicit channel axis for the resize.
    raw_shape = tf.shape(raw)
    num_patches = raw_shape[0] * raw_shape[1] * raw_shape[2]
    windows = tf.reshape(raw, [num_patches, win_h, win_w, 1])

    small_h, small_w = actionShapeSmall[0], actionShapeSmall[1]
    shrunk = tf.image.resize_images(windows, [small_h, small_w])
    # Drop the channel axis again.
    shrunk = tf.reshape(shrunk, [-1, small_h, small_w])

    return U.function(inputs=[obs_in], outputs=shrunk)
def build_getMoveActionDescriptorsRot(make_obs_ph, actionShape, actionShapeSmall, stride):
    """Build a callable that extracts strided patches at four rotations and shrinks them.

    The patches are concatenated along the first axis in the order:
    unrotated, then rotated by pi/4, 2*pi/4 and 3*pi/4 radians.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        called with "observation" to create the input.
        # assumes a single-channel NHWC image batch -- TODO confirm
    actionShape: sequence of int
        height and width of the patches cut from the padded image.
    actionShapeSmall: sequence of int
        height and width each patch is resized to.
    stride: int
        step between neighbouring patches along both image axes.

    Returns
    -------
    getMoveActionDescriptors: function
        maps an observation batch to a tensor of shape
        (4 * num_patches, actionShapeSmall[0], actionShapeSmall[1]).
    """
    obs_in = U.ensure_tf_input(make_obs_ph("observation"))
    image = obs_in.get()
    image_shape = tf.shape(image)

    # Total padding per axis: twice one third of the patch size, rounded down.
    pad = np.int32(2 * np.floor(np.array(actionShape) / 3))
    padded = tf.image.resize_image_with_crop_or_pad(
        image, image_shape[1] + pad[0], image_shape[2] + pad[1])

    win_h, win_w = actionShape[0], actionShape[1]
    raw = tf.extract_image_patches(
        padded,
        ksizes=[1, win_h, win_w, 1],
        strides=[1, stride, stride, 1],
        rates=[1, 1, 1, 1],
        padding='VALID')

    raw_shape = tf.shape(raw)
    num_patches = raw_shape[0] * raw_shape[1] * raw_shape[2]
    windows = tf.reshape(raw, [num_patches, win_h, win_w, 1])

    # The unrotated patches are used as-is; the other three are rotated copies.
    orientations = [windows] + [
        tf.contrib.image.rotate(windows, k * np.pi / 4) for k in (1, 2, 3)
    ]
    stacked = tf.concat(orientations, axis=0)

    small_h, small_w = actionShapeSmall[0], actionShapeSmall[1]
    shrunk = tf.image.resize_images(stacked, [small_h, small_w])
    shrunk = tf.reshape(shrunk, [-1, small_h, small_w])

    return U.function(inputs=[obs_in], outputs=shrunk)
def build_train_cascaded(make_obs_ph,
                         make_target_ph,
                         q_func,
                         num_cascade,
                         num_actions,
                         optimizer,
                         grad_norm_clipping=None,
                         double_q=True,
                         scope="deepq",
                         reuse=None):
    """Build the Q-value getter and the training callable for a cascaded Q-network.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        creates the observation input (called with "obs_t" here).
    make_target_ph: str -> tf.placeholder or TfInput
        called with "target" to create the target input.
    q_func: callable
        invoked as
        ``q_func(obs, num_actions * num_cascade, scope="q_func", reuse=True)``.
    num_cascade: int
        number of cascade levels; second axis of the reshaped Q output.
    num_actions: int
        number of actions; last axis of the reshaped Q output.
    optimizer: object
        must expose ``minimize(loss, var_list=...)``.
    grad_norm_clipping, double_q:
        accepted for interface compatibility; not used by this implementation.
    scope: str or VariableScope
        outer variable scope.
    reuse: bool or None
        forwarded to the outer variable scope.

    Returns
    -------
    getq_f: function
        whatever build_getq returns for these arguments.
    targetTrain: function
        takes (obs_t, target), returns ``[td_error, obs_t, target]`` and runs
        one optimizer step.
    """
    # Built first so that the "q_func" variables exist before being reused below.
    getq_f = build_getq(make_obs_ph, q_func, num_actions, num_cascade, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        obs_in = U.ensure_tf_input(make_obs_ph("obs_t"))
        tgt_in = U.ensure_tf_input(make_target_ph("target"))

        trainable = U.scope_vars(U.absolute_scope_name("q_func"))

        # Flat network output, viewed as (rows, cascade level, action).
        q_flat = q_func(obs_in.get(), num_actions * num_cascade, scope="q_func", reuse=True)
        q_cascade = tf.reshape(q_flat, shape=(-1, num_cascade, num_actions))

        td_error = q_cascade - tf.stop_gradient(tgt_in.get())
        loss = U.huber_loss(td_error)
        train_op = optimizer.minimize(loss, var_list=trainable)

        targetTrain = U.function(
            inputs=[obs_in, tgt_in],
            outputs=[td_error, obs_in.get(), tgt_in.get()],
            updates=[train_op])

    return getq_f, targetTrain
def build_get_2channelobs(make_obs_ph):
    """Build a callable that splits channel 0 of an observation into two boolean masks.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        called with "observation" to create the input; the input is indexed
        with four axes, the last one being the channel.

    Returns
    -------
    getDeic: function
        maps an observation batch to a boolean tensor whose last axis holds
        (channel0 == 1, channel0 == 2).
    """
    obs_in = U.ensure_tf_input(make_obs_ph("observation"))

    is_one = tf.equal(obs_in.get()[:, :, :, 0], 1)
    is_two = tf.equal(obs_in.get()[:, :, :, 0], 2)
    masks = tf.stack([is_one, is_two], axis=-1)

    return U.function(inputs=[obs_in], outputs=masks)
def build_getq(make_obsDeic_ph,
               q_func,
               num_actions,
               num_cascade,
               scope="deepq",
               qscope="q_func",
               reuse=None):
    """Build a callable returning Q-values arranged as (rows, cascade level, action).

    Parameters
    ----------
    make_obsDeic_ph: str -> tf.placeholder or TfInput
        called with "observation" to create the network input.
    q_func: callable
        invoked as ``q_func(x, num_actions * num_cascade, scope=qscope)``.
    num_actions: int
        size of the last axis of the result.
    num_cascade: int
        size of the middle axis of the result.
    scope: str or VariableScope
        outer variable scope.
    qscope: str
        scope passed to q_func.
    reuse: bool or None
        forwarded to the outer variable scope.

    Returns
    -------
    getq: function
        maps the "observation" input to a tensor of shape
        (-1, num_cascade, num_actions).
    """
    with tf.variable_scope(scope, reuse=reuse):
        net_in = U.ensure_tf_input(make_obsDeic_ph("observation"))
        q_flat = q_func(net_in.get(), num_actions * num_cascade, scope=qscope)
        q_by_level = tf.reshape(q_flat, [-1, num_cascade, num_actions])
        return U.function(inputs=[net_in], outputs=q_by_level)
def build_getMoveActionDescriptorsRot(make_obs_ph, actionShape, actionShapeSmall, stride, numOrientations):
    """Build a callable that extracts strided patches at several rotations and shrinks them.

    Eight groups of patches are always concatenated along the first axis.
    With ``numOrientations == 8`` the groups are rotations by k*pi/8 for
    k = 0..7. With ``numOrientations == 4`` only the even rotations
    (k = 0, 2, 4, 6) are used and each one appears twice in a row, so the
    output still has eight groups.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        called with "observation" to create the input.
        # assumes a single-channel NHWC image batch -- TODO confirm
    actionShape: sequence of int
        height and width of the patches cut from the padded image.
    actionShapeSmall: sequence of int
        height and width each patch is resized to.
    stride: int
        step between neighbouring patches along both image axes.
    numOrientations: int
        4 or 8.

    Returns
    -------
    getMoveActionDescriptors: function
        maps an observation batch to a tensor of shape
        (8 * num_patches, actionShapeSmall[0], actionShapeSmall[1]).

    Raises
    ------
    ValueError
        if numOrientations is neither 4 nor 8. Previously this case printed a
        message and then failed on an unbound local variable.
    """
    # Which of the eight rotations goes into each of the eight output groups.
    if numOrientations == 4:
        group_rotation = [0, 0, 2, 2, 4, 4, 6, 6]
    elif numOrientations == 8:
        group_rotation = [0, 1, 2, 3, 4, 5, 6, 7]
    else:
        # Fail before any graph ops are created.
        raise ValueError('ERROR: invalid number of orientations')

    observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
    obs = observations_ph.get()
    shape = tf.shape(obs)

    # Total padding per axis: twice one third of the patch size, rounded down.
    deicticPad = np.int32(2 * np.floor(np.array(actionShape) / 3))
    obsZeroPadded = tf.image.resize_image_with_crop_or_pad(
        obs, shape[1] + deicticPad[0], shape[2] + deicticPad[1])

    patches = tf.extract_image_patches(
        obsZeroPadded,
        ksizes=[1, actionShape[0], actionShape[1], 1],
        strides=[1, stride, stride, 1],
        rates=[1, 1, 1, 1],
        padding='VALID')

    # One row per patch, keeping an explicit channel axis for rotate/resize.
    patchesShape = tf.shape(patches)
    patchesTiled = tf.reshape(patches, [
        patchesShape[0] * patchesShape[1] * patchesShape[2],
        actionShape[0], actionShape[1], 1
    ])

    # Rotation 0 is the unrotated tensor itself; 1..7 are rotated by k*pi/8.
    rotations = [patchesTiled] + [
        tf.contrib.image.rotate(patchesTiled, k * np.pi / 8) for k in range(1, 8)
    ]
    patchesTiledAll = tf.concat([rotations[k] for k in group_rotation], axis=0)

    patchesTiledSmall = tf.image.resize_images(
        patchesTiledAll, [actionShapeSmall[0], actionShapeSmall[1]])
    # Drop the channel axis again.
    patchesTiledSmall = tf.reshape(
        patchesTiledSmall, [-1, actionShapeSmall[0], actionShapeSmall[1]])

    getMoveActionDescriptors = U.function(inputs=[observations_ph],
                                          outputs=patchesTiledSmall)
    return getMoveActionDescriptors
def build_getq_DQN(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
    """Build a callable that evaluates the Q-network on an observation.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        called with "observation" to create the network input.
    q_func: callable
        invoked as ``q_func(obs, num_actions, scope="q_func")``.
    num_actions: int
        output size handed to q_func.
    scope: str or VariableScope
        outer variable scope.
    reuse: bool or None
        forwarded to the outer variable scope.

    Returns
    -------
    getq: function
        maps the "observation" input to the output of q_func.
    """
    with tf.variable_scope(scope, reuse=reuse):
        obs_in = U.ensure_tf_input(make_obs_ph("observation"))
        q_out = q_func(obs_in.get(), num_actions, scope="q_func")
        return U.function(inputs=[obs_in], outputs=q_out)
def build_getq(make_actionDeic_ph,
               q_func,
               num_states,
               num_cascade,
               scope="deepq",
               qscope="q_func",
               reuse=None):
    """Build a callable returning Q-values with `num_states` entries per input row.

    Parameters
    ----------
    make_actionDeic_ph: str -> tf.placeholder or TfInput
        called with "actions" to create the network input.
    q_func: callable
        invoked as ``q_func(x, num_states, scope=qscope)``.
    num_states: int
        output size handed to q_func and last axis of the result.
    num_cascade: int
        accepted for interface compatibility; not used here.
    scope: str or VariableScope
        outer variable scope.
    qscope: str
        scope passed to q_func.
    reuse: bool or None
        forwarded to the outer variable scope.

    Returns
    -------
    getq: function
        maps the "actions" input to a tensor of shape (-1, num_states).
    """
    with tf.variable_scope(scope, reuse=reuse):
        net_in = U.ensure_tf_input(make_actionDeic_ph("actions"))
        q_raw = q_func(net_in.get(), num_states, scope=qscope)
        q_rows = tf.reshape(q_raw, [-1, num_states])
        return U.function(inputs=[net_in], outputs=q_rows)
def build_getDeic_Foc(make_obs_ph, deicticShape):
    """Build a callable producing two-channel boolean deictic patches.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        called with "observation" to create the input.
        # assumes a single-channel NHWC image batch -- TODO confirm
    deicticShape: sequence of int
        patch height and width in its first two entries.

    Returns
    -------
    getDeic: function
        maps an observation batch to a boolean tensor of shape
        (num_patches, deicticShape[0], deicticShape[1], 2) whose channels are
        (patch == 1, patch == 2).
    """
    obs_in = U.ensure_tf_input(make_obs_ph("observation"))
    patch_h, patch_w = deicticShape[0], deicticShape[1]

    # Unit-stride patches taken from the unpadded observation.
    raw = tf.extract_image_patches(
        obs_in.get(),
        ksizes=[1, patch_h, patch_w, 1],
        strides=[1, 1, 1, 1],
        rates=[1, 1, 1, 1],
        padding='VALID')

    # Merge (batch, rows, cols) into one leading axis: one row per patch.
    raw_shape = tf.shape(raw)
    num_patches = raw_shape[0] * raw_shape[1] * raw_shape[2]
    tiled = tf.reshape(raw, [num_patches, patch_h, patch_w])

    masks = tf.stack([tf.equal(tiled, 1), tf.equal(tiled, 2)], axis=-1)
    return U.function(inputs=[obs_in], outputs=masks)
def build_getMoveActionDescriptorsRot(make_obs_ph, patchSize, handSize, smallSize, stride):
    """Build a callable that extracts enlarged patches, rotates, crops and shrinks them.

    Patches are cut at an enlarged size (patchSize * sqrt(2), truncated,
    plus one), rotated, centre-cropped back to patchSize and finally resized
    to smallSize. The four rotations (0, pi/4, 2*pi/4, 3*pi/4 radians) are
    concatenated along the first axis in that order.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        called with "observation" to create the input.
        # assumes a single-channel NHWC image batch -- TODO confirm
    patchSize: int
        side length of the square patch after cropping.
    handSize: int
        subtracted from the enlarged patch size to obtain the amount by which
        each image axis is padded.
    smallSize: int
        side length of the final, resized patch.
    stride: int
        step between neighbouring patches along both image axes.

    Returns
    -------
    getMoveActionDescriptors: function
        maps an observation batch to a tensor of shape
        (4 * num_patches, smallSize, smallSize).
    """
    obs_in = U.ensure_tf_input(make_obs_ph("observation"))
    image = obs_in.get()
    image_shape = tf.shape(image)

    # Side length of the window that is cut out before rotating.
    enlarged = np.int32(patchSize * np.sqrt(2)) + 1
    pad_amount = enlarged - handSize
    padded = tf.image.resize_image_with_crop_or_pad(
        image, image_shape[1] + pad_amount, image_shape[2] + pad_amount)

    raw = tf.extract_image_patches(
        padded,
        ksizes=[1, enlarged, enlarged, 1],
        strides=[1, stride, stride, 1],
        rates=[1, 1, 1, 1],
        padding='VALID')

    raw_shape = tf.shape(raw)
    num_patches = raw_shape[0] * raw_shape[1] * raw_shape[2]
    windows = tf.reshape(raw, [num_patches, enlarged, enlarged, 1])

    # Every orientation, including the zero angle, goes through rotate().
    angles = (0, np.pi / 4, 2 * np.pi / 4, 3 * np.pi / 4)
    stacked = tf.concat(
        [tf.contrib.image.rotate(windows, angle) for angle in angles], axis=0)

    # Crop the rotated windows back to the requested patch size, then shrink.
    cropped = tf.image.resize_image_with_crop_or_pad(stacked, patchSize, patchSize)
    shrunk = tf.image.resize_images(cropped, [smallSize, smallSize])
    shrunk = tf.reshape(shrunk, [-1, smallSize, smallSize])

    return U.function(inputs=[obs_in], outputs=shrunk)
def build_getDeic(make_obs_ph, deicticShape):
    """Build a callable producing three-channel deictic patches.

    Channels on the last axis are: (patch == 1) as float, (patch == 2) as
    float, and a coarse (area-resized) copy of the whole observation that is
    repeated for every patch.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        called with "observation" to create the input.
        # assumes a single-channel NHWC image batch -- TODO confirm
    deicticShape: sequence of int
        patch height and width in its first two entries.

    Returns
    -------
    getDeic: function
        maps an observation batch to a float tensor of shape
        (num_patches, deicticShape[0], deicticShape[1], 3).
    """
    obs_in = U.ensure_tf_input(make_obs_ph("observation"))
    patch_h, patch_w = deicticShape[0], deicticShape[1]

    # Unit-stride patches taken from the unpadded observation.
    raw = tf.extract_image_patches(
        obs_in.get(),
        ksizes=[1, patch_h, patch_w, 1],
        strides=[1, 1, 1, 1],
        rates=[1, 1, 1, 1],
        padding='VALID')

    raw_shape = tf.shape(raw)
    patches_per_image = raw_shape[1] * raw_shape[2]
    num_patches = raw_shape[0] * patches_per_image
    tiled = tf.reshape(raw, [num_patches, patch_h, patch_w])

    # NOTE(review): the coarse image has a fixed size of (4, 4) but is later
    # reshaped to (patch_h, patch_w) -- presumably deicticShape is expected
    # to be (4, 4); confirm. The meaning of the "* 4" scale is not visible here.
    coarse = tf.image.resize_area(obs_in.get(), (4, 4)) * 4

    # Repeat the coarse image once per patch, then move the batch axis to
    # the front so rows line up with `tiled`.
    coarse_repeated = tf.tile([coarse], [patches_per_image, 1, 1, 1, 1])
    coarse_batch_major = tf.transpose(coarse_repeated, [1, 0, 2, 3, 4])
    coarse_rows = tf.reshape(coarse_batch_major, [num_patches, patch_h, patch_w])

    channels = [
        tf.cast(tf.equal(tiled, 1), tf.float32),
        tf.cast(tf.equal(tiled, 2), tf.float32),
        coarse_rows,
    ]
    stacked = tf.stack(channels, axis=-1)

    return U.function(inputs=[obs_in], outputs=stacked)
def build_train_deictic(make_obs_ph,
                        q_func,
                        num_actions,
                        optimizer,
                        grad_norm_clipping=None,
                        gamma=1.0,
                        double_q=True,
                        scope="deepq",
                        reuse=None,
                        batch_size=32,
                        num_deictic_patches=25):
    """Build the deictic Q-learning graph: getter, train step and target update.

    Each state contributes `num_deictic_patches` rows to the network input.
    The next-state value of a state is the maximum over its patches; the
    resulting per-state target is then copied back to every patch row.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        creates observation inputs (called with "obs_t" and "obs_tp1" here).
    q_func: callable
        invoked as ``q_func(obs, num_actions, scope=..., reuse=...)``.
    num_actions: int
        number of actions.
    optimizer: object
        must expose ``minimize(loss, var_list=...)``.
    grad_norm_clipping: float or None
        when not None, the step is built with U.minimize_and_clip using this
        value as clip_val.
    gamma: float
        discount factor.
    double_q: bool
        if true, the next action is chosen with the online network and
        evaluated with the target network.
    scope: str or VariableScope
        outer variable scope.
    reuse: bool or None
        forwarded to the outer variable scope.
    batch_size: int
        number of states per training batch. Defaults to 32, the value that
        used to be hard-coded.
    num_deictic_patches: int
        number of patch rows per state. Defaults to 25, the value that used
        to be hard-coded.

    Returns
    -------
    getq_f: function
        whatever build_getq returns for these arguments.
    train: function
        (obs_t, action, reward, obs_tp1, done, weight) -> td_error; also runs
        one optimizer step.
    trainWOUpdate: function
        same inputs, returns [per-row target, huber errors, td_error] without
        updating any variable.
    update_target: function
        copies the "q_func" variables onto the "target_q_func" variables.
    debug: dict
        ``{'q_values': function}`` evaluating the online network on obs_t.
    """
    # NOTE(review): relies on a build_getq that accepts
    # (make_obs_ph, q_func, num_actions, scope=..., reuse=...) -- confirm
    # which definition is in scope where this function lives.
    getq_f = build_getq(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # online network; reuses the variables created by build_getq above
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target network
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # Q-value of the action that was actually taken, one per patch row
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # best next-step value per patch row
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)

        # maximum over the patches that belong to the same state
        q_tp1_best = tf.reshape(q_tp1_best, [batch_size, num_deictic_patches])
        q_tp1_best_reduced = tf.reduce_max(q_tp1_best, 1)

        # right-hand side of the Bellman equation, one value per state
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_reduced
        # NOTE(review): the done mask multiplies the whole target, reward
        # included (build_train masks only the bootstrap term) -- confirm
        # this is intended.
        q_t_selected_target_masked = (1.0 - done_mask_ph) * q_t_selected_target

        # copy each state's target to all of its patch rows
        q_t_selected_target_tiled = tf.tile(
            tf.reshape(q_t_selected_target_masked, [batch_size, 1]),
            [1, num_deictic_patches])
        q_t_selected_target_expanded = tf.reshape(
            q_t_selected_target_tiled, [batch_size * num_deictic_patches])

        # TD error and loss; the target is constant w.r.t. the gradient
        td_error = q_t_selected - tf.stop_gradient(
            q_t_selected_target_expanded)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # optimization op (optionally with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # Pair online and target variables by sorted name and copy across.
        update_target_expr = [
            var_target.assign(var)
            for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name))
        ]
        update_target_expr = tf.group(*update_target_expr)

        # callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=td_error,
                           updates=[optimize_expr])

        # same inputs, no variable update; exposes intermediate values
        trainWOUpdate = U.function(
            inputs=[
                obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
                importance_weights_ph
            ],
            outputs=[q_t_selected_target_expanded, errors, td_error])

        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

    return getq_f, train, trainWOUpdate, update_target, {
        'q_values': q_values
    }
def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
    """Create the epsilon-greedy action-selection function.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates an input placeholder with
        that name (called with "observation" here).
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model, called as ``q_func(observation, num_actions, scope="q_func")``;
        it returns a tensor of shape (batch_size, num_actions) holding the
        value of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether the variables should be reused. To be able to reuse, the
        scope must be given.

    Returns
    -------
    act: function
        called with (observation, stochastic, update_eps) and returning the
        chosen actions. `stochastic` defaults to True and `update_eps` to
        -1.0; a negative `update_eps` leaves the stored epsilon untouched,
        any other value replaces it.
    """
    with tf.variable_scope(scope, reuse=reuse):
        obs_in = U.ensure_tf_input(make_obs_ph("observation"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        # Exploration rate, stored in the graph and initialised to zero.
        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

        # Greedy choice.
        q_out = q_func(obs_in.get(), num_actions, scope="q_func")
        greedy_actions = tf.argmax(q_out, axis=1)

        # Uniformly random choice, one per batch row.
        n_rows = tf.shape(obs_in.get())[0]
        uniform_actions = tf.random_uniform(tf.stack([n_rows]),
                                            minval=0,
                                            maxval=num_actions,
                                            dtype=tf.int64)

        # Each row explores with probability eps.
        explore = tf.random_uniform(
            tf.stack([n_rows]), minval=0, maxval=1, dtype=tf.float32) < eps
        eps_greedy_actions = tf.where(explore, uniform_actions, greedy_actions)

        chosen_actions = tf.cond(stochastic_ph,
                                 lambda: eps_greedy_actions,
                                 lambda: greedy_actions)

        # Overwrite eps only when a non-negative value is supplied.
        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))

        return U.function(
            inputs=[obs_in, stochastic_ph, update_eps_ph],
            outputs=chosen_actions,
            givens={
                update_eps_ph: -1.0,
                stochastic_ph: True
            },
            updates=[update_eps_expr])
def build_train_cascaded(make_obs_ph,
                         make_target_ph,
                         make_actions_ph,
                         q_func,
                         num_cascade,
                         num_actions,
                         batch_size,
                         num_deictic_patches,
                         optimizer,
                         gamma=1.0,
                         grad_norm_clipping=None,
                         double_q=True,
                         scope="deepq",
                         reuse=None):
    """Build the Q-value getter and the training callable for a cascaded, action-selecting Q-network.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        creates the observation input (called with "obs_t" here).
    make_target_ph: str -> tf.placeholder or TfInput
        called with "target" to create the target input.
    make_actions_ph: str -> tf.placeholder or TfInput
        called with "actions" to create the action-index input.
    q_func: callable
        invoked as
        ``q_func(obs, num_actions * num_cascade, scope="q_func", reuse=True)``.
    num_cascade: int
        number of cascade levels.
    num_actions: int
        number of actions.
    batch_size, num_deictic_patches: int
        their product is the number of rows of the reshaped Q output.
    optimizer: object
        must expose ``minimize(loss, var_list=...)``.
    gamma, double_q:
        accepted for interface compatibility; not used by this implementation.
    grad_norm_clipping: float or None
        when not None, the step is built with U.minimize_and_clip using this
        value as clip_val.
    scope: str or VariableScope
        outer variable scope.
    reuse: bool or None
        forwarded to the outer variable scope.

    Returns
    -------
    getq_f: function
        whatever build_getq returns for these arguments.
    targetTrain: function
        takes (obs_t, actions, target), returns
        ``[td_error, selected Q-values, target]`` and runs one optimizer step.
    """
    # NOTE(review): this call passes `scope_q_func`, so it relies on a
    # build_getq definition accepting that keyword -- confirm which
    # definition is in scope where this function lives.
    getq_f = build_getq(make_obs_ph,
                        q_func,
                        num_actions * num_cascade,
                        scope=scope,
                        scope_q_func="q_func",
                        reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        obs_in = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_in = U.ensure_tf_input(make_actions_ph("actions"))
        tgt_in = U.ensure_tf_input(make_target_ph("target"))

        trainable = U.scope_vars(U.absolute_scope_name("q_func"))
        # Looked up but not used further; no target update is built here.
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # Flat network output, viewed as (rows, cascade level, action).
        q_flat = q_func(obs_in.get(),
                        num_actions * num_cascade,
                        scope="q_func",
                        reuse=True)
        q_by_level = tf.reshape(
            q_flat,
            [batch_size * num_deictic_patches, num_cascade, num_actions])

        # Keep only the Q-value of the chosen action (sum over the action axis
        # after masking with a one-hot encoding).
        action_mask = tf.one_hot(act_in.get(), num_actions)
        q_chosen = tf.reduce_sum(q_by_level * action_mask, 2)

        td_error = q_chosen - tf.stop_gradient(tgt_in.get())
        loss = U.huber_loss(td_error)

        if grad_norm_clipping is None:
            train_op = optimizer.minimize(loss, var_list=trainable)
        else:
            train_op = U.minimize_and_clip(optimizer,
                                           loss,
                                           var_list=trainable,
                                           clip_val=grad_norm_clipping)

        targetTrain = U.function(
            inputs=[obs_in, act_in, tgt_in],
            outputs=[td_error, q_chosen, tgt_in.get()],
            updates=[train_op])

    return getq_f, targetTrain
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None):
    """Create the act, train and target-update functions for Q-learning.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates an input placeholder with
        that name.
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model, called as ``q_func(observation, num_actions, scope, reuse)``;
        it returns a tensor of shape (batch_size, num_actions) holding the
        value of every action.
    num_actions: int
        number of actions.
    optimizer: tf.train.Optimizer
        optimizer used for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true, Double Q-learning (https://arxiv.org/abs/1509.06461) is
        used: the online network picks the next action and the target
        network evaluates it.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether the variables should be reused. To be able to reuse, the
        scope must be given.

    Returns
    -------
    act: function
        action-selection function produced by build_act.
    train: function
        (obs_t, action, reward, obs_tp1, done, weight) -> td_error; also runs
        one optimizer step on the Bellman error.
    update_target: function
        () -> (); copies the online Q-function parameters onto the target
        Q-function.
    debug: {str: function}
        helper functions for debugging; currently only 'q_values'.
    """
    # Built first: this creates the "q_func" variables reused below.
    act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # Inputs of a training batch.
        obs_now = U.ensure_tf_input(make_obs_ph("obs_t"))
        action_ph = tf.placeholder(tf.int32, [None], name="action")
        reward_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_next = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_ph = tf.placeholder(tf.float32, [None], name="done")
        weight_ph = tf.placeholder(tf.float32, [None], name="weight")

        # Online network, sharing parameters with the act function.
        q_now = q_func(obs_now.get(), num_actions, scope="q_func", reuse=True)
        online_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Target network.
        q_next_target = q_func(obs_next.get(), num_actions, scope="target_q_func")
        target_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # Q-value of the action that was actually taken.
        q_taken = tf.reduce_sum(q_now * tf.one_hot(action_ph, num_actions), 1)

        # Estimate of the best value reachable from the next state.
        if double_q:
            q_next_online = q_func(obs_next.get(),
                                   num_actions,
                                   scope="q_func",
                                   reuse=True)
            best_next_action = tf.arg_max(q_next_online, 1)
            q_next_best = tf.reduce_sum(
                q_next_target * tf.one_hot(best_next_action, num_actions), 1)
        else:
            q_next_best = tf.reduce_max(q_next_target, 1)
        # Terminal transitions do not bootstrap.
        q_next_best_masked = (1.0 - done_ph) * q_next_best

        # Right-hand side of the Bellman equation.
        bellman_target = reward_ph + gamma * q_next_best_masked

        # Importance-weighted Huber loss; the target is constant w.r.t. the gradient.
        td_error = q_taken - tf.stop_gradient(bellman_target)
        huber = U.huber_loss(td_error)
        loss = tf.reduce_mean(weight_ph * huber)

        if grad_norm_clipping is None:
            train_op = optimizer.minimize(loss, var_list=online_vars)
        else:
            train_op = U.minimize_and_clip(optimizer,
                                           loss,
                                           var_list=online_vars,
                                           clip_val=grad_norm_clipping)

        # Pair online and target variables by sorted name and copy across.
        by_name = lambda v: v.name
        copy_ops = [
            dst.assign(src)
            for src, dst in zip(sorted(online_vars, key=by_name),
                                sorted(target_vars, key=by_name))
        ]
        sync_op = tf.group(*copy_ops)

        # Callable functions.
        train = U.function(
            inputs=[obs_now, action_ph, reward_ph, obs_next, done_ph, weight_ph],
            outputs=td_error,
            updates=[train_op])
        update_target = U.function([], [], updates=[sync_op])

        q_values = U.function([obs_now], q_now)

    return act_f, train, update_target, {'q_values': q_values}
def build_train_deictic_min_streamlined(make_obs_ph,
                                        q_func,
                                        num_actions,
                                        batch_size,
                                        num_deictic_patches,
                                        max_num_groups,
                                        optimizer,
                                        gamma=1.0,
                                        grad_norm_clipping=None,
                                        double_q=True,
                                        scope="deepq",
                                        reuse=None):
    """Build the "min over groups" deictic Q-learning graph.

    Next-state Q-values are supplied through a placeholder ("q_tp1") instead
    of being computed by a target network. For each state the target is the
    maximum over its patches; for each descriptor group the target is the
    minimum over all states in the batch that contain that group; finally
    every patch row receives the target of its group.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        creates observation inputs (called with "obs_t" and "obs_tp1" here).
    q_func: callable
        invoked as ``q_func(obs, num_actions, scope="q_func", reuse=True)``.
    num_actions: int
        number of actions.
    batch_size: int
        number of states per batch.
    num_deictic_patches: int
        number of patch rows per state.
    max_num_groups: int
        depth of the one-hot encoding of group ids.
    optimizer: object
        must expose ``minimize(loss, var_list=...)``.
    gamma: float
        discount factor.
    grad_norm_clipping: float or None
        when not None, the step is built with U.minimize_and_clip using this
        value as clip_val.
    double_q: bool
        if true, the next action is chosen with the online network and its
        value is read from the supplied q_tp1 values.
    scope: str or VariableScope
        outer variable scope.
    reuse: bool or None
        forwarded to the outer variable scope.

    Returns
    -------
    getq_f: function
        whatever build_getq returns for these arguments.
    train: function
        (obs_t, action, reward, obs_tp1, group_matching, done_mask, q_tp1)
        -> [targets]; also runs one optimizer step.
    trainWOUpdate: function
        same inputs, returns [per-state target, desc_2_state,
        target_min_per_D, D_2_DI, targets] without updating any variable.
    update_target: function
        copies the "q_func" variables onto the "target_q_func" variables.
    """
    # NOTE(review): relies on a build_getq that accepts
    # (make_obs_ph, q_func, num_actions, scope=..., reuse=...) -- confirm
    # which definition is in scope where this function lives.
    getq_f = build_getq(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        group_matching_ph = tf.placeholder(tf.int32, [None], name="group_matching")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done_mask")

        # Next-state Q-values are fed in, which enables a tabular version.
        # NOTE(review): the second dimension is fixed at 4 -- presumably
        # num_actions; confirm before generalizing.
        q_tp1_ph = tf.placeholder(tf.float32, [None, 4], name="q_tp1")

        # online network; reuses the variables created by build_getq above
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # "target network" values come from the placeholder
        q_tp1 = q_tp1_ph
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))

        # Best next-step value per patch row. Both branches now bind the
        # same name; previously the double_q branch bound a different name,
        # leaving q_tp1_maxa undefined for the (default) double_q=True case.
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1)
            q_tp1_maxa = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_maxa = tf.reduce_max(q_tp1, 1)

        # Per-state target = max over actions and patches, size batch_size.
        q_tp1_maxa_reshape = tf.reshape(q_tp1_maxa,
                                        [batch_size, num_deictic_patches])
        q_tp1_target = tf.reduce_max(q_tp1_maxa_reshape, 1)
        q_tp1_target = rew_t_ph + gamma * q_tp1_target
        # NOTE(review): the done mask multiplies the whole target, reward
        # included -- confirm this is intended.
        q_tp1_target = (1.0 - done_mask_ph) * q_tp1_target

        # desc_2_state[b, g] is 1 when state b contains a patch of group g.
        # Dimensions: batch_size x max_num_groups.
        group_matching_3dtensor = tf.reshape(group_matching_ph,
                                             [batch_size, num_deictic_patches])
        groups_onehot = tf.one_hot(group_matching_3dtensor,
                                   max_num_groups,
                                   axis=-1)
        desc_2_state = tf.reduce_max(groups_onehot, 1)

        # Minimum target per group. States that do not contain a group are
        # filled with the batch maximum so they never win the minimum.
        max_target = tf.reduce_max(q_tp1_target)
        q_tp1_target_tiled = tf.tile(tf.reshape(q_tp1_target, [batch_size, 1]),
                                     [1, max_num_groups])
        target_min_per_D_expanded = desc_2_state * q_tp1_target_tiled + (
            1 - desc_2_state) * max_target
        target_min_per_D = tf.reduce_min(target_min_per_D_expanded, 0)

        # Give every patch row the target of its group (one-hot row times
        # the column of group targets).
        D_2_DI = tf.one_hot(group_matching_ph, max_num_groups)
        targets = tf.squeeze(
            tf.matmul(D_2_DI, tf.reshape(target_min_per_D,
                                         [max_num_groups, 1])))

        # Q-value of the action that was actually taken, one per patch row
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # TD error and loss; the target is constant w.r.t. the gradient
        td_error = q_t_selected - tf.stop_gradient(targets)
        errors = U.huber_loss(td_error)
        weighted_error = errors

        # optimization op (optionally with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # Pair online and target variables by sorted name and copy across.
        update_target_expr = [
            var_target.assign(var)
            for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name))
        ]
        update_target_expr = tf.group(*update_target_expr)

        # `targets` and the loss depend on q_tp1_ph, so it has to be an
        # input of `train` too (it was missing, so the placeholder could
        # never be fed). It is appended last, matching trainWOUpdate.
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, group_matching_ph,
            done_mask_ph, q_tp1_ph
        ],
                           outputs=[targets],
                           updates=[optimize_expr])

        # same inputs, no variable update; exposes intermediate values
        trainWOUpdate = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, group_matching_ph,
            done_mask_ph, q_tp1_ph
        ],
                                   outputs=[
                                       q_tp1_target, desc_2_state,
                                       target_min_per_D, D_2_DI, targets
                                   ])

        update_target = U.function([], [], updates=[update_target_expr])

    return getq_f, train, trainWOUpdate, update_target