def __init__(self, obs_space, action_space, num_outputs, model_config, name):
    """Set up the V2 wrapper state around a legacy (V1) model class.

    ``legacy_model_cls`` is not a parameter; it is read from the enclosing
    scope and stored on the instance.

    Args:
        obs_space: Forwarded to ``TFModelV2.__init__``.
        action_space: Forwarded to ``TFModelV2.__init__``.
        num_outputs: Forwarded to ``TFModelV2.__init__``.
        model_config: Model config mapping. The optional keys
            ``"state_shape"``, ``"use_lstm"`` and ``"lstm_cell_size"`` are
            read here to guess the initial recurrent state.
        name: Forwarded to ``TFModelV2.__init__``.
    """
    TFModelV2.__init__(self, obs_space, action_space, num_outputs,
                       model_config, name)
    self.legacy_model_cls = legacy_model_cls

    # Tracks the last v1 model created by the call to forward
    self.cur_instance = None

    # XXX: Try to guess the initial state size. Since the size of the
    # state is known only after forward() for V1 models, it might be
    # wrong.
    state_shapes = model_config.get("state_shape")
    if state_shapes:
        self.initial_state = [
            np.zeros(shape, np.float32) for shape in state_shapes
        ]
    elif model_config.get("use_lstm"):
        lstm_size = model_config.get("lstm_cell_size", 256)
        # Two zero vectors of the cell size.
        self.initial_state = [
            np.zeros(lstm_size, np.float32) for _ in range(2)
        ]
    else:
        self.initial_state = []

    # Tracks update ops
    self._update_ops = None

    with tf.variable_scope(self.name) as scope:
        self.variable_scope = scope
def __init__(self, obs_space, action_space, num_outputs, model_config, name):
    """Create one ``OnlineLinearRegression`` per output.

    Args:
        obs_space: Observation space; a sample is drawn from it and its
            ``size`` is used as the feature dimension.
        action_space: Forwarded to ``TFModelV2.__init__``.
        num_outputs: Forwarded to ``TFModelV2.__init__``; one regression is
            created per ``self.num_outputs``.
        model_config: Model config mapping; optional keys ``"alpha"`` and
            ``"lambda_"`` (both defaulting to 1) are passed to every
            regression.
        name: Forwarded to ``TFModelV2.__init__``.
    """
    TFModelV2.__init__(self, obs_space, action_space, num_outputs,
                       model_config, name)

    regression_kwargs = {
        "alpha": model_config.get("alpha", 1),
        "lambda_": model_config.get("lambda_", 1),
    }
    self.feature_dim = obs_space.sample().size
    regression_kwargs["feature_dim"] = self.feature_dim

    # Every arm is built with the same hyper-parameters.
    self.arms = [
        OnlineLinearRegression(**regression_kwargs)
        for _ in range(self.num_outputs)
    ]

    self._cur_value = None
    self._cur_ctx = None
def __init__(self, obs_space, action_space, num_outputs, model_config, name):
    """Create a single ``OnlineLinearRegression`` over the "item" features.

    Args:
        obs_space: Observation space whose ``original_space`` must be a
            ``gym.spaces.Dict`` containing an ``"item"`` entry.
        action_space: Forwarded to ``TFModelV2.__init__``.
        num_outputs: Forwarded to ``TFModelV2.__init__``.
        model_config: Model config mapping; optional keys ``"alpha"``
            (default 1) and ``"lambda_"`` (default 0.1) are passed to the
            regression.
        name: Forwarded to ``TFModelV2.__init__``.

    Raises:
        AssertionError: If the original space is not a Dict space with an
            ``"item"`` key.
    """
    TFModelV2.__init__(self, obs_space, action_space, num_outputs,
                       model_config, name)
    alpha = model_config.get("alpha", 1)
    lambda_ = model_config.get("lambda_", 0.1)

    # RLlib preprocessors will flatten the observation space and unflatten
    # it later. Accessing the original space here.
    original_space = obs_space.original_space
    has_item_dict = (isinstance(original_space, gym.spaces.Dict)
                     and "item" in original_space.spaces)
    assert has_item_dict, \
        "This model only supports gym.spaces.Dict observation spaces."

    # The last axis of the "item" space is used as the feature dimension.
    self.feature_dim = original_space["item"].shape[-1]
    self.arm = OnlineLinearRegression(
        feature_dim=self.feature_dim,
        alpha=alpha,
        lambda_=lambda_,
    )

    self._cur_value = None
    self._cur_ctx = None
def __init__(self, obs_space, action_space, num_outputs, model_config, name):
    """Initialize a TFModelV2.

    All arguments are forwarded unchanged to ``TFModelV2.__init__``.

    Here is an example implementation for a subclass
    ``MyRNNClass(RecurrentTFModelV2)``::

        def __init__(self, *args, **kwargs):
            super(MyRNNClass, self).__init__(*args, **kwargs)
            cell_size = 256

            # Define input layers
            input_layer = tf.keras.layers.Input(
                shape=(None, obs_space.shape[0]))
            state_in_h = tf.keras.layers.Input(shape=(cell_size, ))
            state_in_c = tf.keras.layers.Input(shape=(cell_size, ))
            seq_in = tf.keras.layers.Input(shape=())

            # Send to LSTM cell
            lstm_out, state_h, state_c = tf.keras.layers.LSTM(
                cell_size, return_sequences=True, return_state=True,
                name="lstm")(
                    inputs=input_layer,
                    mask=tf.sequence_mask(seq_in),
                    initial_state=[state_in_h, state_in_c])
            output_layer = tf.keras.layers.Dense(...)(lstm_out)

            # Create the RNN model
            self.rnn_model = tf.keras.Model(
                inputs=[input_layer, seq_in, state_in_h, state_in_c],
                outputs=[output_layer, state_h, state_c])
            self.register_variables(self.rnn_model.variables)
            self.rnn_model.summary()
    """
    TFModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
def __init__(self, obs_space, action_space, num_outputs, model_config, name):
    """Build the Keras policy and value networks and register their variables.

    ``model_config`` is first merged over ``DEFAULT_STRATEGO_MODEL_CONFIG``.
    Two towers (prefix ``pi`` and ``vf``) of Conv2D -> Flatten -> Dense
    layers are created, each followed by an LSTM when ``use_lstm`` is set.
    If ``custom_options['policy_keras_model_file_path']`` is present and not
    None, the policy tower is loaded from that file instead of being built.

    Args:
        obs_space: Observation space; ``original_space`` is indexed with
            the policy / value observation keys to size the input layers.
        action_space: Forwarded to ``TFModelV2.__init__``.
        num_outputs: Units of the policy logits layer when the policy is
            built here, and of the value head when ``custom_options['q_fn']``
            is truthy.
        model_config: Model config mapping (merged with the defaults).
        name: Forwarded to ``TFModelV2.__init__``.

    Raises:
        AssertionError: On an unknown observation mode, or when
            ``vf_share_layers`` is set together with ``BOTH_OBSERVATIONS``.
        NotImplementedError: If a policy model file is configured together
            with ``use_lstm``.
    """
    model_config = with_base_config(
        base_config=DEFAULT_STRATEGO_MODEL_CONFIG, extra_config=model_config)
    TFModelV2.__init__(self, obs_space, action_space, num_outputs,
                       model_config, name)

    print(model_config)

    custom_options = model_config['custom_options']
    observation_mode = custom_options['observation_mode']
    if observation_mode == PARTIALLY_OBSERVABLE:
        self.pi_obs_key = 'partial_observation'
        self.vf_obs_key = 'partial_observation'
    elif observation_mode == FULLY_OBSERVABLE:
        self.pi_obs_key = 'full_observation'
        self.vf_obs_key = 'full_observation'
    elif observation_mode == BOTH_OBSERVATIONS:
        # Policy sees the partial observation, value function the full one,
        # so the two towers cannot share layers.
        self.pi_obs_key = 'partial_observation'
        self.vf_obs_key = 'full_observation'
        assert not model_config['vf_share_layers']
    else:
        assert False, "policy observation_mode must be in [PARTIALLY_OBSERVABLE, FULLY_OBSERVABLE, BOTH_OBSERVATIONS]"

    if model_config["custom_preprocessor"]:
        print(obs_space)
        self.preprocessor = ModelCatalog.get_preprocessor_for_space(
            observation_space=self.obs_space.original_space,
            options=model_config)
    else:
        self.preprocessor = None
        # Logger.warn is a deprecated alias (removed in Python 3.13).
        logger.warning(
            "No custom preprocessor for StrategoModel was specified.\n"
            "Some tree search policies may not initialize their placeholders correctly without this.")

    self.use_lstm = model_config['use_lstm']
    self.lstm_cell_size = model_config['lstm_cell_size']
    self.vf_share_layers = model_config.get("vf_share_layers")
    self.mask_invalid_actions = custom_options['mask_invalid_actions']

    conv_activation = get_activation_fn(model_config.get("conv_activation"))
    cnn_filters = model_config.get("conv_filters")
    fc_activation = get_activation_fn(model_config.get("fcnet_activation"))
    hiddens = model_config.get("fcnet_hiddens")

    pi_obs_shape = obs_space.original_space[self.pi_obs_key].shape
    vf_obs_shape = obs_space.original_space[self.vf_obs_key].shape

    if self.use_lstm:
        # Separate (h, c) state pairs for the policy and value LSTMs.
        state_in = [
            tf.keras.layers.Input(shape=(self.lstm_cell_size,), name="pi_lstm_h"),
            tf.keras.layers.Input(shape=(self.lstm_cell_size,), name="pi_lstm_c"),
            tf.keras.layers.Input(shape=(self.lstm_cell_size,), name="vf_lstm_h"),
            tf.keras.layers.Input(shape=(self.lstm_cell_size,), name="vf_lstm_c"),
        ]
        seq_lens_in = tf.keras.layers.Input(shape=(), name="lstm_seq_in")

        # Leading None axis is the (variable-length) time dimension.
        self.pi_obs_inputs = tf.keras.layers.Input(
            shape=(None, *pi_obs_shape), name="pi_observation")
        self.vf_obs_inputs = tf.keras.layers.Input(
            shape=(None, *vf_obs_shape), name="vf_observation")
    else:
        state_in, seq_lens_in = None, None

        self.pi_obs_inputs = tf.keras.layers.Input(
            shape=pi_obs_shape, name="pi_observation")
        self.vf_obs_inputs = tf.keras.layers.Input(
            shape=vf_obs_shape, name="vf_observation")

    if cnn_filters is None:
        # assuming board size will always remain the same for both pi and vf networks
        if self.use_lstm:
            single_obs_input_shape = self.pi_obs_inputs.shape.as_list()[2:]
        else:
            single_obs_input_shape = self.pi_obs_inputs.shape.as_list()[1:]
        cnn_filters = _get_filter_config(single_obs_input_shape)

    def maybe_td(layer):
        # Apply the layer per time step when the inputs carry a time axis.
        if self.use_lstm:
            return tf.keras.layers.TimeDistributed(layer=layer)
        else:
            return layer

    def build_primary_layers(prefix: str, obs_in: tf.Tensor, state_in: tf.Tensor):
        # encapsulated in a function to either be called once for shared policy/vf or twice for separate policy/vf
        _last_layer = obs_in

        for i, (out_size, kernel, stride) in enumerate(cnn_filters):
            _last_layer = maybe_td(tf.keras.layers.Conv2D(
                filters=out_size,
                kernel_size=kernel,
                strides=stride,
                activation=conv_activation,
                padding="same",
                name="{}_conv_{}".format(prefix, i)))(_last_layer)

        _last_layer = maybe_td(tf.keras.layers.Flatten())(_last_layer)

        for i, size in enumerate(hiddens):
            _last_layer = maybe_td(tf.keras.layers.Dense(
                size,
                name="{}_fc_{}".format(prefix, i),
                activation=fc_activation,
                kernel_initializer=normc_initializer(1.0)))(_last_layer)

        if self.use_lstm:
            _last_layer, *state_out = tf.keras.layers.LSTM(
                units=self.lstm_cell_size,
                return_sequences=True,
                return_state=True,
                name="{}_lstm".format(prefix))(
                    inputs=_last_layer,
                    mask=tf.sequence_mask(seq_lens_in),
                    initial_state=state_in)
        else:
            state_out = None

        return _last_layer, state_out

    if self.use_lstm:
        pi_state_in = state_in[:2]
        vf_state_in = state_in[2:]
    else:
        pi_state_in, vf_state_in = None, None

    # None both when the key is absent and when it is explicitly None.
    policy_file_path = custom_options.get('policy_keras_model_file_path')

    if policy_file_path is not None:
        if self.use_lstm:
            raise NotImplementedError

        pi_state_out = None
        self._pi_model = load_model(filepath=policy_file_path, compile=False)
        # rename layers
        for layer in self._pi_model.layers:
            layer._name = "pi_" + layer.name
        self._pi_model.layers[-1]._name = 'pi_unmasked_logits'

        self.unmasked_logits_out = self._pi_model(self.pi_obs_inputs)
    else:
        self._pi_model = None
        pi_last_layer, pi_state_out = build_primary_layers(
            prefix="pi", obs_in=self.pi_obs_inputs, state_in=pi_state_in)

        self.unmasked_logits_out = maybe_td(tf.keras.layers.Dense(
            num_outputs,
            name="pi_unmasked_logits",
            activation=None,
            kernel_initializer=normc_initializer(0.01)))(pi_last_layer)

    vf_last_layer, vf_state_out = build_primary_layers(
        prefix="vf", obs_in=self.vf_obs_inputs, state_in=vf_state_in)

    if self.use_lstm:
        state_out = [*pi_state_out, *vf_state_out]
    else:
        state_out = None

    self._use_q_fn = custom_options['q_fn']

    # One value per action for a Q-function, a single scalar otherwise.
    if self._use_q_fn:
        value_out_size = num_outputs
    else:
        value_out_size = 1

    value_out = maybe_td(tf.keras.layers.Dense(
        value_out_size,
        name="vf_out",
        activation=None,
        kernel_initializer=normc_initializer(0.01)))(vf_last_layer)

    model_inputs = [self.pi_obs_inputs, self.vf_obs_inputs]
    model_outputs = [self.unmasked_logits_out, value_out]
    if self.use_lstm:
        model_inputs += [seq_lens_in, *state_in]
        model_outputs += state_out

    self.base_model = tf.keras.Model(inputs=model_inputs, outputs=model_outputs)

    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    self.base_model.summary()

    self.register_variables(self.base_model.variables)
def __init__(self, obs_space=None, action_space=None, num_outputs=35, model_config=None, name='my_model'):
    """Build the attention-based Keras model with a policy and a value head.

    The model takes four inputs, each of width ``config.MAX_CONTEXTS``:
    source-token indices, path indices, target-token indices and a
    context-valid mask. The three index inputs are embedded, concatenated,
    passed through dropout and a per-context dense layer, then reduced by
    ``AttentionLayer`` to a code vector that feeds two dense heads.

    Args:
        obs_space: Forwarded to ``TFModelV2.__init__``.
        action_space: Forwarded to ``TFModelV2.__init__``.
        num_outputs: Units of the softmax ``target_index`` head.
        model_config: Model config mapping forwarded to
            ``TFModelV2.__init__``. ``None`` (the default) means a fresh
            empty dict, so instances never share one default mapping.
        name: Forwarded to ``TFModelV2.__init__``.
    """
    if model_config is None:
        model_config = {}

    self.base_model: Optional[keras.Model] = None
    self.keras_eval_model: Optional[keras.Model] = None
    self.keras_model_predict_function: Optional[
        K.GraphExecutionFunction] = None
    self.training_status: ModelTrainingStatus = ModelTrainingStatus()
    self._checkpoint: Optional[tf.train.Checkpoint] = None
    self._checkpoint_manager: Optional[tf.train.CheckpointManager] = None

    TFModelV2.__init__(self, obs_space, action_space, num_outputs,
                       model_config, name)
    config = Config(set_defaults=True, load_from_args=True, verify=True)
    # NOTE(review): self.config and self.vocabs used below are presumably
    # set by Code2VecModelBase.__init__ — confirm against that class.
    Code2VecModelBase.__init__(self, config)

    # The code below was originally the body of _create_keras_model(self).
    # Imports stay function-local and after the base initializers, as in
    # the original.
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras.layers import Input, Embedding, Concatenate, Dropout, TimeDistributed, Dense
    import tensorflow.keras.backend as K

    # Each input sample consists of a bag of `MAX_CONTEXTS` tuples (source_terminal, path, target_terminal).
    # The valid mask indicates for each context whether it actually exists or it is just a padding.
    path_source_token_input = Input((self.config.MAX_CONTEXTS, ), dtype=tf.int32)
    path_input = Input((self.config.MAX_CONTEXTS, ), dtype=tf.int32)
    path_target_token_input = Input((self.config.MAX_CONTEXTS, ), dtype=tf.int32)
    context_valid_mask = Input((self.config.MAX_CONTEXTS, ))

    # Input paths are indexes, we embed these here.
    paths_embedded = Embedding(self.vocabs.path_vocab.size,
                               self.config.PATH_EMBEDDINGS_SIZE,
                               name='path_embedding')(path_input)

    # Input terminals are indexes, we embed these here.
    # The same embedding layer is applied to source and target tokens.
    token_embedding_shared_layer = Embedding(
        self.vocabs.token_vocab.size,
        self.config.TOKEN_EMBEDDINGS_SIZE,
        name='token_embedding')
    path_source_token_embedded = token_embedding_shared_layer(
        path_source_token_input)
    path_target_token_embedded = token_embedding_shared_layer(
        path_target_token_input)

    # `Context` is a concatenation of the 2 terminals & path embedding.
    # Each context is a vector of size 3 * EMBEDDINGS_SIZE.
    context_embedded = Concatenate()([
        path_source_token_embedded, paths_embedded,
        path_target_token_embedded
    ])
    context_embedded = Dropout(1 - self.config.DROPOUT_KEEP_RATE)(
        context_embedded)

    # Lets get dense: Apply a dense layer for each context vector (using same weights for all of the context).
    context_after_dense = TimeDistributed(
        Dense(self.config.CODE_VECTOR_SIZE,
              use_bias=False,
              activation='tanh'))(context_embedded)

    # The final code vectors are received by applying attention to the "densed" context vectors.
    code_vectors, attention_weights = AttentionLayer(name='attention')(
        [context_after_dense, context_valid_mask])

    # "Decode": the policy head is sized by num_outputs rather than by
    # the target vocabulary.
    target_index = Dense(num_outputs,
                         use_bias=False,
                         activation='softmax',
                         name='target_index')(code_vectors)
    value_out = Dense(1, activation=None, name='value_out')(code_vectors)

    # Wrap the layers into a Keras model with both heads as outputs.
    inputs = [
        path_source_token_input, path_input, path_target_token_input,
        context_valid_mask
    ]
    self.base_model = keras.Model(inputs=inputs,
                                  outputs=[target_index, value_out])
    self.register_variables(self.base_model.variables)
def __init__(self, obs_space, action_space, num_outputs, model_config, name):
    """Build the convolutional (optionally ConvLSTM) policy / value model.

    ``model_config`` is first merged over ``DEFAULT_STRATEGO_MODEL_CONFIG``.
    Two towers (prefix ``pi`` and ``vf``) of Conv2D layers are built; when
    ``use_lstm`` is set and ``custom_options['fake_lstm']`` is falsy, each
    tower ends in a single ConvLSTM2D layer. With ``fake_lstm`` the state
    inputs are passed straight through as state outputs.

    Args:
        obs_space: Observation space; ``original_space`` is indexed with
            the policy / value observation keys. The first two shape
            entries of the policy observation are used as rows / columns.
        action_space: Action space; ``action_space.n`` divided by
            ``rows * columns`` gives the filter count of the logits and
            Q-value convolutions.
        num_outputs: Forwarded to ``TFModelV2.__init__``.
        model_config: Model config mapping (merged with the defaults).
        name: Forwarded to ``TFModelV2.__init__``.

    Raises:
        AssertionError: On an unknown observation mode, or when
            ``vf_share_layers`` is set together with ``BOTH_OBSERVATIONS``.
        NotImplementedError: If more than one entry is given in
            ``custom_options['lstm_filters']`` while the ConvLSTM is used.
    """
    model_config = with_base_config(
        base_config=DEFAULT_STRATEGO_MODEL_CONFIG, extra_config=model_config)
    TFModelV2.__init__(self, obs_space, action_space, num_outputs,
                       model_config, name)

    print(model_config)

    custom_options = model_config['custom_options']
    observation_mode = custom_options['observation_mode']
    if observation_mode == PARTIALLY_OBSERVABLE:
        self.pi_obs_key = 'partial_observation'
        self.vf_obs_key = 'partial_observation'
    elif observation_mode == FULLY_OBSERVABLE:
        self.pi_obs_key = 'full_observation'
        self.vf_obs_key = 'full_observation'
    elif observation_mode == BOTH_OBSERVATIONS:
        self.pi_obs_key = 'partial_observation'
        self.vf_obs_key = 'full_observation'
        assert not model_config['vf_share_layers']
    else:
        assert False, "policy observation_mode must be in [PARTIALLY_OBSERVABLE, FULLY_OBSERVABLE, BOTH_OBSERVATIONS]"

    if model_config["custom_preprocessor"]:
        print(obs_space)
        self.preprocessor = ModelCatalog.get_preprocessor_for_space(
            observation_space=self.obs_space.original_space,
            options=model_config)
    else:
        self.preprocessor = None
        # Logger.warn is a deprecated alias (removed in Python 3.13).
        logger.warning(
            "No custom preprocessor for StrategoModel was specified.\n"
            "Some tree search policies may not initialize their placeholders correctly without this."
        )

    self.use_lstm = model_config['use_lstm']
    self.fake_lstm = custom_options.get('fake_lstm')
    self.vf_share_layers = model_config.get("vf_share_layers")
    self.mask_invalid_actions = custom_options['mask_invalid_actions']

    conv_activation = get_activation_fn(model_config.get("conv_activation"))
    lstm_filters = custom_options['lstm_filters']
    cnn_filters = model_config.get("conv_filters")
    final_pi_filter_amt = custom_options["final_pi_filter_amt"]

    pi_obs_shape = obs_space.original_space[self.pi_obs_key].shape
    vf_obs_shape = obs_space.original_space[self.vf_obs_key].shape
    rows = pi_obs_shape[0]
    columns = pi_obs_shape[1]

    if self.use_lstm:
        if self.fake_lstm:
            # Placeholder state shape; the state is only passed through.
            self._lstm_state_shape = (1, )
        else:
            self._lstm_state_shape = (rows, columns, lstm_filters[0][0])

    if self.use_lstm:
        # Separate (h, c) state pairs for the policy and value towers.
        state_in = [
            tf.keras.layers.Input(shape=self._lstm_state_shape, name="pi_lstm_h"),
            tf.keras.layers.Input(shape=self._lstm_state_shape, name="pi_lstm_c"),
            tf.keras.layers.Input(shape=self._lstm_state_shape, name="vf_lstm_h"),
            tf.keras.layers.Input(shape=self._lstm_state_shape, name="vf_lstm_c"),
        ]
        seq_lens_in = tf.keras.layers.Input(shape=(), name="lstm_seq_in")

        # Leading None axis is the (variable-length) time dimension.
        self.pi_obs_inputs = tf.keras.layers.Input(
            shape=(None, *pi_obs_shape), name="pi_observation")
        self.vf_obs_inputs = tf.keras.layers.Input(
            shape=(None, *vf_obs_shape), name="vf_observation")
    else:
        state_in, seq_lens_in = None, None

        self.pi_obs_inputs = tf.keras.layers.Input(
            shape=pi_obs_shape, name="pi_observation")
        self.vf_obs_inputs = tf.keras.layers.Input(
            shape=vf_obs_shape, name="vf_observation")

    def maybe_td(layer):
        # Apply the layer per time step when the inputs carry a time axis.
        if self.use_lstm:
            return tf.keras.layers.TimeDistributed(layer=layer,
                                                   name=f"td_{layer.name}")
        else:
            return layer

    def build_primary_layers(prefix: str, obs_in: tf.Tensor, state_in: tf.Tensor):
        # encapsulated in a function to either be called once for shared policy/vf or twice for separate policy/vf
        _last_layer = obs_in

        for i, (out_size, kernel, stride) in enumerate(cnn_filters):
            _last_layer = maybe_td(
                tf.keras.layers.Conv2D(filters=out_size,
                                       kernel_size=kernel,
                                       strides=stride,
                                       activation=conv_activation,
                                       padding="same",
                                       name="{}_conv_{}".format(prefix, i)))(_last_layer)

        # Without a real ConvLSTM the incoming state is returned unchanged.
        state_out = state_in
        if self.use_lstm and not self.fake_lstm:
            for i, (out_size, kernel, stride) in enumerate(lstm_filters):
                if i > 0:
                    raise NotImplementedError(
                        "Only single lstm layers are implemented right now"
                    )

                _last_layer, *state_out = tf.keras.layers.ConvLSTM2D(
                    filters=out_size,
                    kernel_size=kernel,
                    strides=stride,
                    activation=conv_activation,
                    padding="same",
                    return_sequences=True,
                    return_state=True,
                    name="{}_convlstm".format(prefix))(
                        inputs=_last_layer,
                        mask=tf.sequence_mask(seq_lens_in),
                        initial_state=state_in)

        return _last_layer, state_out

    if self.use_lstm:
        pi_state_in = state_in[:2]
        vf_state_in = state_in[2:]
    else:
        pi_state_in, vf_state_in = None, None

    pi_last_layer, pi_state_out = build_primary_layers(
        prefix="pi", obs_in=self.pi_obs_inputs, state_in=pi_state_in)

    vf_last_layer, vf_state_out = build_primary_layers(
        prefix="vf", obs_in=self.vf_obs_inputs, state_in=vf_state_in)

    if self.use_lstm:
        state_out = [*pi_state_out, *vf_state_out]
    else:
        state_out = None

    pi_last_layer = maybe_td(
        tf.keras.layers.Conv2D(filters=final_pi_filter_amt,
                               kernel_size=[3, 3],
                               strides=1,
                               activation=conv_activation,
                               padding="same",
                               name="pi_conv_last"))(pi_last_layer)

    # Number of output channels per board cell, computed once and reused
    # for both the logits and the Q-value convolutions.
    filters_per_cell = int(action_space.n / (rows * columns))

    print(
        f"action space n: {action_space.n}, rows: {rows}, columns: {columns}, filters: {filters_per_cell}"
    )

    unmasked_logits_out = maybe_td(
        tf.keras.layers.Conv2D(
            filters=filters_per_cell,
            kernel_size=[3, 3],
            strides=1,
            activation=None,
            padding="same",
            name="pi_conv_unmasked_logits"))(pi_last_layer)

    self._use_q_fn = custom_options['q_fn']

    if self._use_q_fn:
        # Q-function head: same layout as the policy head.
        vf_last_layer = maybe_td(
            tf.keras.layers.Conv2D(filters=final_pi_filter_amt,
                                   kernel_size=[3, 3],
                                   strides=1,
                                   activation=conv_activation,
                                   padding="same",
                                   name="vf_conv_last"))(vf_last_layer)

        value_out = maybe_td(
            tf.keras.layers.Conv2D(
                filters=filters_per_cell,
                kernel_size=[3, 3],
                strides=1,
                activation=None,
                padding="same",
                name="vf_conv_q_out"))(vf_last_layer)
    else:
        # State-value head: 1x1 conv down to one channel, then a scalar.
        vf_last_layer = maybe_td(
            tf.keras.layers.Conv2D(filters=1,
                                   kernel_size=[1, 1],
                                   strides=1,
                                   activation=conv_activation,
                                   padding="same",
                                   name="vf_conv_last"))(vf_last_layer)

        vf_last_layer = maybe_td(
            tf.keras.layers.Flatten(name="vf_flatten"))(vf_last_layer)

        value_out = maybe_td(
            tf.keras.layers.Dense(
                units=1,
                name="vf_out",
                activation=None,
                kernel_initializer=normc_initializer(0.01)))(vf_last_layer)

    model_inputs = [self.pi_obs_inputs, self.vf_obs_inputs]
    model_outputs = [unmasked_logits_out, value_out]
    if self.use_lstm:
        model_inputs += [seq_lens_in, *state_in]
        model_outputs += state_out

    self.base_model = tf.keras.Model(inputs=model_inputs,
                                     outputs=model_outputs)

    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    self.base_model.summary()

    self.register_variables(self.base_model.variables)
def __init__(self, obs_space, action_space, num_outputs, model_config, name):
    """Set up templates that build legacy (V1) model instances on demand.

    ``legacy_model_cls`` is not a parameter; it is read from the enclosing
    scope. Two ``tf.make_template`` wrappers are created here:
    ``instance_template`` for the main model and ``vf_template`` for the
    value function. Neither inner function runs during ``__init__``.

    Args:
        obs_space: Forwarded to ``TFModelV2.__init__`` and to every legacy
            model instance.
        action_space: Forwarded to ``TFModelV2.__init__`` and to every
            legacy model instance.
        num_outputs: Forwarded to ``TFModelV2.__init__`` and to the main
            legacy model instance.
        model_config: Model config mapping. ``"vf_share_layers"`` and
            ``"use_lstm"`` are read inside ``vf_template``; the optional
            keys ``"state_shape"``, ``"use_lstm"`` and ``"lstm_cell_size"``
            are read here to guess the initial state.
        name: Forwarded to ``TFModelV2.__init__``.
    """
    TFModelV2.__init__(self, obs_space, action_space, num_outputs,
                       model_config, name)
    self.legacy_model_cls = legacy_model_cls

    def instance_template(input_dict, state, seq_lens):
        # create a new model instance
        with tf.variable_scope(self.name):
            new_instance = self.legacy_model_cls(
                input_dict, obs_space, action_space, num_outputs,
                model_config, state, seq_lens)
        return new_instance

    self.instance_template = tf.make_template("instance_template",
                                              instance_template)
    # Tracks the last v1 model created by the call to forward
    self.cur_instance = None

    def vf_template(last_layer, input_dict):
        # self.variable_scope is only assigned at the end of __init__, so
        # this function must not be called before construction finishes.
        with tf.variable_scope(self.variable_scope):
            with tf.variable_scope("value_function"):
                # Simple case: sharing the feature layer
                if model_config["vf_share_layers"]:
                    return tf.reshape(
                        linear(last_layer, 1, "value_function",
                               normc_initializer(1.0)), [-1])

                # Create a new separate model with no RNN state, etc.
                # Work on a copy so the caller's config is not modified.
                branch_model_config = model_config.copy()
                branch_model_config["free_log_std"] = False
                if branch_model_config["use_lstm"]:
                    branch_model_config["use_lstm"] = False
                    logger.warning(
                        "It is not recommended to use a LSTM model "
                        "with vf_share_layers=False (consider "
                        "setting it to True). If you want to not "
                        "share layers, you can implement a custom "
                        "LSTM model that overrides the "
                        "value_function() method.")
                # The value branch has a single output.
                branch_instance = legacy_model_cls(
                    input_dict,
                    obs_space,
                    action_space,
                    1,
                    branch_model_config,
                    state_in=None,
                    seq_lens=None)
                return tf.reshape(branch_instance.outputs, [-1])

    self.vf_template = tf.make_template("vf_template", vf_template)

    # XXX: Try to guess the initial state size. Since the size of the
    # state is known only after forward() for V1 models, it might be
    # wrong.
    if model_config.get("state_shape"):
        self.initial_state = [
            np.zeros(s, np.float32) for s in model_config["state_shape"]
        ]
    elif model_config.get("use_lstm"):
        cell_size = model_config.get("lstm_cell_size", 256)
        self.initial_state = [
            np.zeros(cell_size, np.float32),
            np.zeros(cell_size, np.float32),
        ]
    else:
        self.initial_state = []

    # Capture the scope object so vf_template can re-enter it later.
    with tf.variable_scope(self.name) as scope:
        self.variable_scope = scope
def __init__(self, obs_space, action_space, num_outputs, model_config, name):
    """Initialize the model by delegating to ``TFModelV2.__init__``.

    All five arguments are forwarded unchanged; no additional state is set
    up here.
    """
    TFModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
def __init__(self, obs_space, action_space, num_outputs, model_config, name):
    """Build a shared conv base plus separate policy and twin-Q head models.

    ``model_config`` is first merged over ``DEFAULT_STRATEGO_MODEL_CONFIG``.
    Four Keras models are created: ``_base_model`` (observation -> feature
    map, optionally through a single ConvLSTM2D layer) and three heads
    (``pi_model``, ``q1_model``, ``q2_model``). The heads take a separate
    ``model_out`` input rather than the base model's output tensor. A
    trainable ``log_alpha`` variable and ``alpha = exp(log_alpha)`` are
    also created.

    Args:
        obs_space: Observation space; ``original_space`` is indexed with
            the observation key. Its first two shape entries are used as
            rows / columns.
        action_space: Action space; ``action_space.n`` divided by
            ``rows * columns`` gives the filter count of the head output
            convolutions.
        num_outputs: Forwarded to ``TFModelV2.__init__``.
        model_config: Model config mapping (merged with the defaults).
        name: Forwarded to ``TFModelV2.__init__``; also used as the prefix
            of the four Keras model names.

    Raises:
        NotImplementedError: For ``BOTH_OBSERVATIONS`` mode, or if more
            than one ``base_lstm_filters`` entry is given while the
            ConvLSTM is used.
        AssertionError: On an unknown observation mode.
    """
    model_config = with_base_config(
        base_config=DEFAULT_STRATEGO_MODEL_CONFIG, extra_config=model_config)
    TFModelV2.__init__(self, obs_space, action_space, num_outputs,
                       model_config, name)

    print(model_config)

    custom_options = model_config['custom_options']
    observation_mode = custom_options['observation_mode']
    if observation_mode == PARTIALLY_OBSERVABLE:
        self._obs_key = 'partial_observation'
    elif observation_mode == FULLY_OBSERVABLE:
        self._obs_key = 'full_observation'
    elif observation_mode == BOTH_OBSERVATIONS:
        raise NotImplementedError
    else:
        assert False, "policy observation_mode must be in [PARTIALLY_OBSERVABLE, FULLY_OBSERVABLE, BOTH_OBSERVATIONS]"

    self._action_dist_class, self._logit_dim = ModelCatalog.get_action_dist(
        self.action_space, model_config)

    self.use_lstm = model_config['use_lstm']
    self.fake_lstm = custom_options.get('fake_lstm', False)
    self.mask_invalid_actions = custom_options['mask_invalid_actions']

    conv_activation = get_activation_fn(model_config.get("conv_activation"))
    base_lstm_filters = custom_options['base_lstm_filters']
    base_cnn_filters = custom_options['base_cnn_filters']
    pi_cnn_filters = custom_options['pi_cnn_filters']
    q_cnn_filters = custom_options['q_cnn_filters']

    obs_shape = obs_space.original_space[self._obs_key].shape
    rows = obs_shape[0]
    columns = obs_shape[1]

    # Number of output channels per board cell, computed once and reused
    # by the policy and both Q heads.
    filters_per_cell = int(action_space.n / (rows * columns))

    if self.use_lstm:
        self._lstm_state_shape = (rows, columns, base_lstm_filters[0][0])

    # The base output has the ConvLSTM's channels when it is really used,
    # otherwise those of the last base conv layer.
    if self.use_lstm and not self.fake_lstm:
        self._base_model_out_shape = (rows, columns, base_lstm_filters[0][0])
    else:
        self._base_model_out_shape = (rows, columns, base_cnn_filters[-1][0])

    if self.use_lstm:
        state_in = [
            tf.keras.layers.Input(shape=self._lstm_state_shape, name="base_lstm_h"),
            tf.keras.layers.Input(shape=self._lstm_state_shape, name="base_lstm_c"),
        ]
        seq_lens_in = tf.keras.layers.Input(shape=(), name="lstm_seq_in")

        # Leading None axis is the (variable-length) time dimension.
        self._obs_inputs = tf.keras.layers.Input(
            shape=(None, *obs_shape), name="observation")
        self._base_model_out = tf.keras.layers.Input(
            shape=self._base_model_out_shape, name="model_out")
    else:
        state_in, seq_lens_in = None, None

        self._obs_inputs = tf.keras.layers.Input(
            shape=obs_shape, name="observation")
        self._base_model_out = tf.keras.layers.Input(
            shape=self._base_model_out_shape, name="model_out")

    def maybe_td(layer):
        # Apply the layer per time step when the inputs carry a time axis.
        if self.use_lstm:
            return tf.keras.layers.TimeDistributed(layer=layer,
                                                   name=f"td_{layer.name}")
        else:
            return layer

    def build_shared_base_layers(prefix: str, obs_in: tf.Tensor, state_in: tf.Tensor):
        _last_layer = obs_in

        for i, (out_size, kernel, stride) in enumerate(base_cnn_filters):
            _last_layer = maybe_td(
                tf.keras.layers.Conv2D(filters=out_size,
                                       kernel_size=kernel,
                                       strides=stride,
                                       activation=conv_activation,
                                       padding="same",
                                       name="{}_conv_{}".format(prefix, i)))(_last_layer)

        # Without a real ConvLSTM the incoming state is returned unchanged.
        base_state_out = state_in
        if self.use_lstm and not self.fake_lstm:
            for i, (out_size, kernel, stride) in enumerate(base_lstm_filters):
                if i > 0:
                    raise NotImplementedError(
                        "Only single lstm layers are implemented right now"
                    )

                _last_layer, *base_state_out = tf.keras.layers.ConvLSTM2D(
                    filters=out_size,
                    kernel_size=kernel,
                    strides=stride,
                    activation=conv_activation,
                    padding="same",
                    data_format='channels_last',
                    return_sequences=True,
                    return_state=True,
                    name="{}_convlstm".format(prefix))(
                        inputs=_last_layer,
                        initial_state=state_in,
                        mask=tf.sequence_mask(seq_lens_in))

        return _last_layer, base_state_out

    def build_pi_layers(input_layer):
        _last_layer = input_layer
        for i, (out_size, kernel, stride) in enumerate(pi_cnn_filters):
            _last_layer = tf.keras.layers.Conv2D(
                filters=out_size,
                kernel_size=kernel,
                strides=stride,
                activation=conv_activation,
                padding="same",
                name="{}_conv_{}".format('pi', i))(_last_layer)

        print(
            f"action space n: {action_space.n}, rows: {rows}, columns: {columns}, filters: {filters_per_cell}"
        )

        unmasked_logits = tf.keras.layers.Conv2D(
            filters=filters_per_cell,
            kernel_size=[3, 3],
            strides=1,
            activation=None,
            padding="same",
            name="pi_conv_unmasked_logits")(_last_layer)
        return unmasked_logits

    def build_q_layers(input_layer, prefix):
        _last_layer = input_layer
        for i, (out_size, kernel, stride) in enumerate(q_cnn_filters):
            _last_layer = tf.keras.layers.Conv2D(
                filters=out_size,
                kernel_size=kernel,
                strides=stride,
                activation=conv_activation,
                padding="same",
                name="{}_conv_{}".format(prefix, i))(_last_layer)

        q_val = tf.keras.layers.Conv2D(
            filters=filters_per_cell,
            kernel_size=[3, 3],
            strides=1,
            activation=None,
            padding="same",
            name="{}_conv_{}".format(prefix, "q_out"))(_last_layer)
        return q_val

    base_model_out, state_out = build_shared_base_layers(
        prefix="shared_base", obs_in=self._obs_inputs, state_in=state_in)
    # The heads are wired to the standalone "model_out" input, not to
    # base_model_out.
    pi_unmasked_logits_out = build_pi_layers(
        input_layer=self._base_model_out)
    q1_out = build_q_layers(input_layer=self._base_model_out, prefix="q1")
    q2_out = build_q_layers(input_layer=self._base_model_out, prefix="q2")

    base_inputs = [self._obs_inputs]
    base_outputs = [base_model_out]
    if self.use_lstm:
        base_inputs += [seq_lens_in, *state_in]
        base_outputs += [*state_out]

    self._base_model = tf.keras.Model(name=f"{name}_base",
                                      inputs=base_inputs,
                                      outputs=base_outputs)
    self.pi_model = tf.keras.Model(name=f"{name}_pi_head",
                                   inputs=[self._base_model_out],
                                   outputs=[pi_unmasked_logits_out])
    self.q1_model = tf.keras.Model(name=f"{name}_q1_head",
                                   inputs=[self._base_model_out],
                                   outputs=[q1_out])
    self.q2_model = tf.keras.Model(name=f"{name}_q2_head",
                                   inputs=[self._base_model_out],
                                   outputs=[q2_out])

    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    self._base_model.summary()
    self.pi_model.summary()
    self.q1_model.summary()
    self.q2_model.summary()

    # Registration order is kept: base, pi, q1, q2, then log_alpha.
    self.register_variables(self._base_model.variables)
    self.register_variables(self.pi_model.variables)
    self.register_variables(self.q1_model.variables)
    self.register_variables(self.q2_model.variables)

    self.log_alpha = tf.Variable(0.0, dtype=tf.float32, name="log_alpha")
    self.alpha = tf.exp(self.log_alpha)
    self.register_variables([self.log_alpha])
def __init__(self, obs_space, action_space, num_outputs, model_config, name, twin_q):
    """Build fully-connected policy and Q-value models (optionally twin Q).

    ``model_config`` is first merged over ``DEFAULT_STRATEGO_MODEL_CONFIG``.
    A policy model and a main Q model are created from Dense towers sized
    by ``model_config['fcnet_hiddens']``; with ``twin_q`` a second Q model
    with its own tower is added. A trainable ``log_alpha`` variable and
    ``alpha = exp(log_alpha)`` are also created. LSTM is not supported.

    Args:
        obs_space: Observation space; ``original_space`` is indexed with
            the policy / value observation keys to size the input layers.
        action_space: Action space; ``action_space.n`` is the number of
            units of every output layer.
        num_outputs: Forwarded to ``TFModelV2.__init__``.
        model_config: Model config mapping (merged with the defaults).
        name: Forwarded to ``TFModelV2.__init__``.
        twin_q: Whether to build the additional twin Q model. Requires
            ``custom_options['q_fn']`` to be truthy.

    Raises:
        NotImplementedError: If ``model_config['use_lstm']`` is truthy.
        AssertionError: On an unknown observation mode, when
            ``vf_share_layers`` is set together with ``BOTH_OBSERVATIONS``,
            or when ``twin_q`` is requested without ``q_fn``.
    """
    model_config = with_base_config(
        base_config=DEFAULT_STRATEGO_MODEL_CONFIG, extra_config=model_config)
    TFModelV2.__init__(self, obs_space, action_space, num_outputs,
                       model_config, name)

    print(model_config)

    custom_options = model_config['custom_options']
    observation_mode = custom_options['observation_mode']
    if observation_mode == PARTIALLY_OBSERVABLE:
        self.pi_obs_key = 'partial_observation'
        self.vf_obs_key = 'partial_observation'
    elif observation_mode == FULLY_OBSERVABLE:
        self.pi_obs_key = 'full_observation'
        self.vf_obs_key = 'full_observation'
    elif observation_mode == BOTH_OBSERVATIONS:
        self.pi_obs_key = 'partial_observation'
        self.vf_obs_key = 'full_observation'
        assert not model_config['vf_share_layers']
    else:
        assert False, "policy observation_mode must be in [PARTIALLY_OBSERVABLE, FULLY_OBSERVABLE, BOTH_OBSERVATIONS]"

    if model_config["custom_preprocessor"]:
        print(obs_space)
        self.preprocessor = ModelCatalog.get_preprocessor_for_space(
            observation_space=self.obs_space.original_space,
            options=model_config)
    else:
        self.preprocessor = None
        # Logger.warn is a deprecated alias (removed in Python 3.13).
        logger.warning(
            "No custom preprocessor for StrategoModel was specified.\n"
            "Some tree search policies may not initialize their placeholders correctly without this."
        )

    self.use_lstm = model_config['use_lstm']
    # Everything below assumes a non-recurrent model; this single check
    # replaces the repeated (unreachable) use_lstm re-checks.
    if self.use_lstm:
        raise NotImplementedError

    self.fake_lstm = custom_options.get('fake_lstm', False)
    self.vf_share_layers = model_config.get("vf_share_layers")
    self.mask_invalid_actions = custom_options['mask_invalid_actions']
    self._use_q_fn = custom_options['q_fn']
    self.twin_q = twin_q
    # A twin Q network only makes sense when the value head is a Q-function.
    assert not (not self._use_q_fn and self.twin_q)
    self._sac_alpha = model_config.get("sac_alpha", False)

    # The conv activation is reused for the Dense hidden layers below.
    conv_activation = get_activation_fn(model_config.get("conv_activation"))

    self.pi_obs_inputs = tf.keras.layers.Input(
        shape=obs_space.original_space[self.pi_obs_key].shape,
        name="pi_observation")
    self.vf_obs_inputs = tf.keras.layers.Input(
        shape=obs_space.original_space[self.vf_obs_key].shape,
        name="vf_observation")

    def maybe_td(layer):
        # Identity here because use_lstm is always falsy past the check
        # above; kept so layers are wrapped consistently if LSTM support
        # is added.
        if self.use_lstm:
            return tf.keras.layers.TimeDistributed(layer=layer,
                                                   name=f"td_{layer.name}")
        else:
            return layer

    def build_primary_layers(prefix: str, obs_in: tf.Tensor, state_in: tf.Tensor):
        # encapsulated in a function to either be called once for shared policy/vf or twice for separate policy/vf
        _last_layer = obs_in
        state_out = state_in
        for i, size in enumerate(model_config['fcnet_hiddens']):
            _last_layer = maybe_td(
                tf.keras.layers.Dense(size,
                                      name="{}_fc_{}".format(prefix, i),
                                      activation=conv_activation,
                                      kernel_initializer=normc_initializer(1.0)))(_last_layer)
        return _last_layer, state_out

    self.main_vf_prefix = "main_vf" if self.twin_q else "vf"

    # No recurrent state exists, so every tower is built with state_in=None.
    pi_last_layer, _ = build_primary_layers(
        prefix="pi", obs_in=self.pi_obs_inputs, state_in=None)

    vf_last_layer, _ = build_primary_layers(
        prefix=self.main_vf_prefix, obs_in=self.vf_obs_inputs, state_in=None)

    if self.twin_q:
        twin_vf_last_layer, _ = build_primary_layers(
            prefix="twin_vf", obs_in=self.vf_obs_inputs, state_in=None)
    else:
        twin_vf_last_layer = None

    # maybe_td wraps the layer (not the layer's output tensor), matching
    # how the hidden layers above are wrapped.
    unmasked_logits_out = maybe_td(
        tf.keras.layers.Dense(
            action_space.n,
            name="{}_fc_{}".format('pi', 'unmasked_logits'),
            activation=None,
            kernel_initializer=normc_initializer(1.0)))(pi_last_layer)

    value_out = maybe_td(
        tf.keras.layers.Dense(
            action_space.n,
            name="{}_fc_{}".format(self.main_vf_prefix, 'q_out'),
            activation=None,
            kernel_initializer=normc_initializer(1.0)))(vf_last_layer)

    if self.twin_q:
        twin_value_out = maybe_td(
            tf.keras.layers.Dense(
                action_space.n,
                name="{}_fc_{}".format('twin_vf', 'q_out'),
                activation=None,
                kernel_initializer=normc_initializer(1.0)))(twin_vf_last_layer)

    self.pi_model = tf.keras.Model(inputs=[self.pi_obs_inputs],
                                   outputs=[unmasked_logits_out])
    self.main_q_model = tf.keras.Model(inputs=[self.vf_obs_inputs],
                                       outputs=[value_out])

    # summary() prints the table itself and returns None, so it is not
    # wrapped in print(). Registration order is kept: twin Q (if any),
    # pi, main Q, then log_alpha.
    if self.twin_q:
        self.twin_q_model = tf.keras.Model(inputs=[self.vf_obs_inputs],
                                           outputs=[twin_value_out])
        self.twin_q_model.summary()
        self.register_variables(self.twin_q_model.variables)

    self.pi_model.summary()
    self.main_q_model.summary()

    self.register_variables(self.pi_model.variables)
    self.register_variables(self.main_q_model.variables)

    self.log_alpha = tf.Variable(0.0, dtype=tf.float32, name="log_alpha")
    self.alpha = tf.exp(self.log_alpha)
    self.register_variables([self.log_alpha])
def __init__(self,
             obs_space,
             action_space,
             num_outputs,
             model_config,
             name,
             q_hiddens=None,
             dueling=False,
             num_atoms=1,
             use_noisy=False,
             v_min=-10.0,
             v_max=10.0,
             sigma0=0.5,
             parameter_noise=False):
    """Build a convolutional Q-network over a single board observation.

    The network is a stack of "same"-padded Conv2D layers (taken from
    ``model_config["conv_filters"]``), one extra 3x3 Conv2D with
    ``final_pi_filter_amt`` filters, and a final linear 3x3 Conv2D whose
    filter count is ``action_space.n // (rows * columns)``. The resulting
    Keras model is stored in ``self.base_model`` and its variables are
    registered with the TFModelV2 base class.

    Args:
        obs_space: Observation space. Must expose ``original_space``,
            indexable by observation key, whose entries have a ``shape``
            with at least two leading dimensions (rows, columns).
        action_space: Action space exposing ``n``.
        num_outputs: Forwarded unchanged to ``TFModelV2.__init__``.
        model_config: Model config; it is merged over
            ``DEFAULT_STRATEGO_MODEL_CONFIG`` before use.
        name: Forwarded unchanged to ``TFModelV2.__init__``.
        q_hiddens: Unsupported; must be falsy.
        dueling: Unsupported; must be falsy.
        num_atoms: Unsupported; must be 1.
        use_noisy: Unsupported; must be falsy.
        v_min: Accepted for signature compatibility; not used here.
        v_max: Accepted for signature compatibility; not used here.
        sigma0: Accepted for signature compatibility; not used here.
        parameter_noise: If true, a LayerNormalization layer is added
            after every convolution of the trunk and after the final
            hidden convolution.

    Raises:
        NotImplementedError: If any unsupported Q-network option is
            requested, or if ``model_config["use_lstm"]`` is set (the
            ConvLSTM path is wired up but not validated for Q models).
        ValueError: If the configured observation mode is
            ``BOTH_OBSERVATIONS`` or is not a recognised mode.
    """
    if q_hiddens or dueling or num_atoms != 1 or use_noisy:
        raise NotImplementedError

    model_config = with_base_config(
        base_config=DEFAULT_STRATEGO_MODEL_CONFIG,
        extra_config=model_config)
    TFModelV2.__init__(self, obs_space, action_space, num_outputs,
                       model_config, name)
    print(model_config)

    # Select which entry of the original (dict) observation feeds the net.
    observation_mode = model_config['custom_options']['observation_mode']
    if observation_mode == PARTIALLY_OBSERVABLE:
        self.vf_obs_key = 'partial_observation'
    elif observation_mode == FULLY_OBSERVABLE:
        self.vf_obs_key = 'full_observation'
    elif observation_mode == BOTH_OBSERVATIONS:
        raise ValueError(
            f"Using {BOTH_OBSERVATIONS} format doesn't make sense for a Q-network, there's no policy, just a Q-function"
        )
    else:
        # Raise instead of `assert False`: asserts are stripped under
        # `python -O`, which would leave `self.vf_obs_key` undefined.
        raise ValueError(
            "policy observation_mode must be in [PARTIALLY_OBSERVABLE, FULLY_OBSERVABLE, BOTH_OBSERVATIONS]"
        )

    if model_config["custom_preprocessor"]:
        print(obs_space)
        self.preprocessor = ModelCatalog.get_preprocessor_for_space(
            observation_space=self.obs_space.original_space,
            options=model_config)
    else:
        self.preprocessor = None
        # `Logger.warn` is a deprecated alias of `Logger.warning`.
        logger.warning(
            "No custom preprocessor for StrategoModel was specified.\n"
            "Some tree search policies may not initialize their placeholders correctly without this."
        )

    self.use_lstm = model_config['use_lstm']
    self.vf_share_layers = model_config.get("vf_share_layers")
    self.mask_invalid_actions = model_config['custom_options'][
        'mask_invalid_actions']

    conv_activation = get_activation_fn(
        model_config.get("conv_activation"))
    lstm_filters = model_config["custom_options"]['lstm_filters']
    cnn_filters = model_config.get("conv_filters")
    final_pi_filter_amt = model_config["custom_options"][
        "final_pi_filter_amt"]

    board_shape = obs_space.original_space[self.vf_obs_key].shape
    rows = board_shape[0]
    columns = board_shape[1]

    if self.use_lstm:
        # "same" padding keeps the board size, so the recurrent state is
        # (rows, columns, filters of the first ConvLSTM layer).
        self._lstm_state_shape = (rows, columns, lstm_filters[0][0])
        state_in = [
            tf.keras.layers.Input(shape=self._lstm_state_shape,
                                  name="vf_lstm_h"),
            tf.keras.layers.Input(shape=self._lstm_state_shape,
                                  name="vf_lstm_c")
        ]
        seq_lens_in = tf.keras.layers.Input(shape=(), name="lstm_seq_in")
        # Extra leading `None` is the (variable-length) time axis.
        self.vf_obs_inputs = tf.keras.layers.Input(
            shape=(None, *board_shape), name="vf_observation")
    else:
        state_in, seq_lens_in = None, None
        self.vf_obs_inputs = tf.keras.layers.Input(
            shape=board_shape, name="vf_observation")

    def maybe_td(layer):
        """Wrap `layer` in TimeDistributed when running recurrently."""
        if self.use_lstm:
            return tf.keras.layers.TimeDistributed(
                layer=layer, name=f"td_{layer.name}")
        return layer

    def build_primary_layers(prefix: str, obs_in: tf.Tensor,
                             state_in: tf.Tensor):
        """Build the conv trunk (plus optional ConvLSTM) for `prefix`."""
        _last_layer = obs_in
        for i, (out_size, kernel, stride) in enumerate(cnn_filters):
            _last_layer = maybe_td(
                tf.keras.layers.Conv2D(
                    filters=out_size,
                    kernel_size=kernel,
                    strides=stride,
                    activation=conv_activation,
                    padding="same",
                    name="{}_conv_{}".format(prefix, i)))(_last_layer)
            if parameter_noise:
                # assuming inputs shape (batch_size x w x h x channel)
                _last_layer = maybe_td(
                    tf.keras.layers.LayerNormalization(
                        axis=(1, 2),
                        name=f"{prefix}_LayerNorm_{i}"))(_last_layer)

        state_out = state_in
        if self.use_lstm:
            for i, (out_size, kernel, stride) in enumerate(lstm_filters):
                if i > 0:
                    raise NotImplementedError(
                        "Only single lstm layers are implemented right now"
                    )
                _last_layer, *state_out = tf.keras.layers.ConvLSTM2D(
                    filters=out_size,
                    kernel_size=kernel,
                    strides=stride,
                    activation=conv_activation,
                    padding="same",
                    return_sequences=True,
                    return_state=True,
                    name="{}_convlstm".format(prefix))(
                        inputs=_last_layer,
                        mask=tf.sequence_mask(seq_lens_in),
                        initial_state=state_in)
            # The recurrent path has not been validated for this model.
            raise NotImplementedError("havent checked lstms for q model")
        return _last_layer, state_out

    # Only one network exists here, so it owns *all* recurrent state
    # tensors. (Slicing with `state_in[2:]` would hand the ConvLSTM an
    # empty initial state, since `state_in` holds just [h, c].)
    vf_state_in = state_in if self.use_lstm else None

    vf_last_layer, vf_state_out = build_primary_layers(
        prefix="vf", obs_in=self.vf_obs_inputs, state_in=vf_state_in)
    state_out = vf_state_out if self.use_lstm else None

    vf_last_layer = maybe_td(
        tf.keras.layers.Conv2D(
            filters=final_pi_filter_amt,
            kernel_size=[3, 3],
            strides=1,
            activation=conv_activation,
            padding="same",
            name="{}_conv_{}".format('vf', "last")))(vf_last_layer)
    if parameter_noise:
        # assuming inputs shape (batch_size x w x h x channel)
        vf_last_layer = maybe_td(
            tf.keras.layers.LayerNormalization(
                axis=(1, 2), name="vf_LayerNorm_last"))(vf_last_layer)

    # One output channel per action "plane": the flat action space is
    # spread over the rows x columns board. Integer division avoids a
    # round-trip through float.
    logit_filters = int(action_space.n // (rows * columns))
    print(
        f"action space n: {action_space.n}, rows: {rows}, columns: {columns}, filters: {logit_filters}"
    )

    unmasked_logits_out = maybe_td(
        tf.keras.layers.Conv2D(
            filters=logit_filters,
            kernel_size=[3, 3],
            strides=1,
            activation=None,
            padding="same",
            name="{}_conv_{}".format('vf',
                                     "unmasked_logits")))(vf_last_layer)

    model_inputs = [self.vf_obs_inputs]
    model_outputs = [unmasked_logits_out]
    if self.use_lstm:
        model_inputs += [seq_lens_in, *state_in]
        model_outputs += state_out

    self.base_model = tf.keras.Model(inputs=model_inputs,
                                     outputs=model_outputs)
    # `summary()` prints the table itself and returns None; wrapping it in
    # print() would emit a stray "None" line.
    self.base_model.summary()
    self.register_variables(self.base_model.variables)