def create_model(self, model_info):
    """Create a Deep-Q MLP network, optionally with a dueling head.

    Builds NUM_LAYERS relu Dense layers of HIDDEN_SIZE units on top of the
    state input, compiles the Keras model with MSE loss, and wires an
    inference placeholder + TFVariables handle into the owning session.

    :param model_info: model configuration dict (unused here).
    :return: the compiled Keras Model.
    """
    state = Input(shape=self.state_dim)
    denselayer = Dense(HIDDEN_SIZE, activation='relu')(state)
    for _ in range(NUM_LAYERS - 1):
        denselayer = Dense(HIDDEN_SIZE, activation='relu')(denselayer)

    value = Dense(self.action_dim, activation='linear')(denselayer)
    if self.dueling:
        # Dueling head: a scalar stream (`adv`) is combined with the
        # mean-normalized per-action stream.
        # NOTE(review): `layer_normalize`/`layer_add` are defined elsewhere;
        # presumably this realizes Q = V + (A - mean(A)) — confirm there.
        adv = Dense(1, activation='linear')(denselayer)
        mean = Lambda(layer_normalize)(value)
        value = Lambda(layer_add)([adv, mean])

    model = Model(inputs=state, outputs=value)
    adam = Adam(lr=self.learning_rate)
    model.compile(loss='mse', optimizer=adam)

    # Inference path: feed raw float states through a dedicated placeholder.
    self.infer_state = tf.placeholder(tf.float32, name="infer_input",
                                      shape=(None, ) + tuple(self.state_dim))
    self.infer_v = model(self.infer_state)
    self.actor_var = TFVariables([self.infer_v], self.sess)

    # Fix: tf.initialize_all_variables() has been deprecated/removed;
    # use tf.global_variables_initializer(), consistent with the IMPALA
    # model elsewhere in this file.
    self.sess.run(tf.global_variables_initializer())
    return model
def create_model(self, model_info):
    """Create a Deep-Q CNN network for image observations.

    Input is uint8 pixels, rescaled to [0, 1] in-graph; three conv layers
    (Nature-DQN shapes) feed a 256-unit dense layer and a linear Q head.
    Also wires the inference placeholder and TFVariables into the session.

    :param model_info: model configuration dict; if "summary" is truthy,
        print the Keras model summary.
    :return: the compiled Keras Model.
    """
    state = Input(shape=self.state_dim, dtype="uint8")
    # Cast + rescale inside the graph so callers can feed raw uint8 frames.
    state1 = Lambda(lambda x: K.cast(x, dtype='float32') / 255.)(state)
    convlayer = Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
                       padding='valid')(state1)
    convlayer = Conv2D(64, (4, 4), strides=(2, 2), activation='relu',
                       padding='valid')(convlayer)
    convlayer = Conv2D(64, (3, 3), strides=(1, 1), activation='relu',
                       padding='valid')(convlayer)
    flattenlayer = Flatten()(convlayer)
    denselayer = Dense(256, activation='relu')(flattenlayer)
    value = Dense(self.action_dim, activation='linear')(denselayer)

    model = Model(inputs=state, outputs=value)
    # clipnorm guards against exploding gradients on the conv stack.
    adam = Adam(lr=self.learning_rate, clipnorm=10.)
    model.compile(loss='mse', optimizer=adam)
    if model_info.get("summary"):
        model.summary()

    self.infer_state = tf.placeholder(tf.uint8, name="infer_input",
                                      shape=(None, ) + tuple(self.state_dim))
    self.infer_v = model(self.infer_state)
    self.actor_var = TFVariables([self.infer_v], self.sess)

    # Fix: tf.initialize_all_variables() has been deprecated/removed;
    # use tf.global_variables_initializer(), consistent with the IMPALA
    # model elsewhere in this file.
    self.sess.run(tf.global_variables_initializer())
    return model
def create_model(self, model_info):
    """Create an IMPALA-style CNN actor-critic network.

    Two heads share a conv trunk: a softmax policy head trained with
    `impala_loss` (weighted by the advantage input) and a scalar value
    head trained with MSE (weight 0.5).

    :param model_info: model configuration dict (unused here).
    :return: the compiled Keras Model taking [state, advantage] inputs.
    """
    state_input = Input(shape=self.state_dim, name='state_input',
                        dtype='uint8')
    # `layer_function` (defined elsewhere) presumably casts/rescales the
    # uint8 frames — confirm against its definition.
    state_input_1 = Lambda(layer_function)(state_input)
    # Advantage is an auxiliary input consumed only by the policy loss.
    advantage = Input(shape=(1, ), name='adv')

    convlayer = Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
                       padding='valid')(state_input_1)
    convlayer = Conv2D(64, (4, 4), strides=(2, 2), activation='relu',
                       padding='valid')(convlayer)
    convlayer = Conv2D(64, (3, 3), strides=(1, 1), activation='relu',
                       padding='valid')(convlayer)
    flattenlayer = Flatten()(convlayer)
    denselayer = Dense(256, activation='relu')(flattenlayer)

    out_actions = Dense(self.action_dim, activation='softmax',
                        name='output_actions')(denselayer)
    out_value = Dense(1, name='output_value')(denselayer)
    model = Model(inputs=[state_input, advantage],
                  outputs=[out_actions, out_value])

    losses = {"output_actions": impala_loss(advantage),
              "output_value": 'mse'}
    lossweights = {"output_actions": 1.0, "output_value": .5}
    decay_value = 0.00000000512
    model.compile(optimizer=Adam(lr=LR, clipnorm=40., decay=decay_value),
                  loss=losses, loss_weights=lossweights)

    self.infer_state = tf.placeholder(tf.uint8, name="infer_state",
                                      shape=(None,) + tuple(self.state_dim))
    self.adv = tf.placeholder(tf.float32, name="adv", shape=(None, 1))
    self.infer_p, self.infer_v = model([self.infer_state, self.adv])

    # Fix: tf.initialize_all_variables() has been deprecated/removed;
    # use tf.global_variables_initializer(), consistent with the IMPALA
    # model elsewhere in this file.
    self.sess.run(tf.global_variables_initializer())
    return model
def create_rep_network(self):
    """Build the representation network: raw uint8 observation -> hidden state.

    Rescales pixels to [0, 1] in-graph, applies three conv layers, and
    projects the flattened features to a HIDDEN_OUT-dim relu embedding.
    """
    obs = Input(shape=self.state_dim, name='rep_input')
    scaled = Lambda(lambda t: tf.cast(t, dtype='float32') / 255.)(obs)

    feature = scaled
    for filters, kernel, stride in ((32, 8, 4), (32, 4, 2), (64, 3, 1)):
        feature = Conv2D(filters, (kernel, kernel),
                         strides=(stride, stride),
                         activation='relu', padding='valid')(feature)

    hidden = Dense(HIDDEN_OUT, activation='relu')(Flatten()(feature))
    return Model(inputs=obs, outputs=hidden)
def get_cnn_backbone(state_dim, act_dim, hidden_sizes, activation,
                     filter_arches, vf_share_layers=True, summary=False,
                     dtype='uint8'):
    """Get CNN backbone."""
    state_input_raw = Input(shape=state_dim, name='obs')

    # Normalize the input depending on its declared dtype.
    if dtype == 'uint8':
        state_input = Lambda(layer_function)(state_input_raw)
    elif dtype == 'float32':
        state_input = state_input_raw
    else:
        raise ValueError(
            'dtype: {} not supported automatically, please implement it yourself'
            .format(dtype))

    def _trunk(tag):
        # Conv stack + MLP stack under a shared naming tag.
        conv_out = build_conv_layers(state_input, filter_arches, activation,
                                     tag)
        return bulid_mlp_layers(Flatten()(conv_out), hidden_sizes,
                                activation, tag)

    if vf_share_layers:
        shared = _trunk('shared')
        pi_latent = Dense(act_dim, activation=None,
                          name='pi_latent')(shared)
        out_value = Dense(1, activation=None, name='output_value')(shared)
    else:
        pi_latent = Dense(act_dim, activation=None,
                          name='pi_latent')(_trunk('pi'))
        out_value = Dense(1, activation=None,
                          name='output_value')(_trunk('v'))

    model = Model(inputs=[state_input_raw], outputs=[pi_latent, out_value])
    if summary:
        model.summary()
    return model
def create_model(self, model_info):
    """Build the actor-critic CNN (softmax policy + scalar value heads)
    and hand it to self.build_graph for session wiring.
    """
    state_input = Input(shape=self.state_dim, name='state_input',
                        dtype='uint8')
    normalized = Lambda(layer_function)(state_input)

    features = normalized
    for filters, kernel, stride in ((32, 8, 4), (32, 4, 2), (64, 3, 1)):
        features = Conv2D(filters, (kernel, kernel),
                          strides=(stride, stride),
                          activation='relu', padding='valid')(features)

    dense = Dense(256, activation='relu', name='dense_1')(
        Flatten()(features))
    out_actions = Dense(self.action_dim, activation='softmax',
                        name='output_actions_raw')(dense)
    out_value = Dense(1, name='output_value')(dense)

    model = Model(inputs=[state_input], outputs=[out_actions, out_value])
    self.build_graph(np.uint8, model)
    return model
def get_cnn_backbone(state_dim, act_dim, hidden_sizes, activation,
                     filter_arches, vf_share_layers=True, summary=False):
    """Get CNN backbone."""
    state_input_raw = Input(shape=state_dim, name='obs')
    state_input = Lambda(layer_function)(state_input_raw)

    def _trunk(tag):
        # Conv stack + MLP stack under a shared naming tag.
        conv_out = build_conv_layers(state_input, filter_arches, activation,
                                     tag)
        return bulid_mlp_layers(Flatten()(conv_out), hidden_sizes,
                                activation, tag)

    if vf_share_layers:
        shared = _trunk('shared')
        pi_latent = Dense(act_dim, activation=None,
                          name='pi_latent')(shared)
        out_value = Dense(1, activation=None, name='output_value')(shared)
    else:
        pi_latent = Dense(act_dim, activation=None,
                          name='pi_latent')(_trunk('pi'))
        out_value = Dense(1, activation=None,
                          name='output_value')(_trunk('v'))

    model = Model(inputs=[state_input_raw], outputs=[pi_latent, out_value])
    if summary:
        model.summary()
    return model
def create_model(self, model_info):
    """Build the full IMPALA actor + learner graph in raw TF1 style.

    Actor side: a fully-convolutional policy (1x1 conv head over the last
    conv feature map) plus a scalar baseline, with multinomial sampling
    for action selection. Learner side: v-trace loss over placeholder-fed
    behaviour-policy batches, with Adam or RMSProp and global-norm
    gradient clipping.

    :param model_info: model configuration dict (unused here).
    :return: True on successful graph construction.
    """
    # Actor input placeholder; dtype comes from config (e.g. uint8 frames).
    self.ph_state = tf.placeholder(self.input_dtype,
                                   shape=(None, *self.state_dim),
                                   name="state_input")
    # All trainable variables live under this scope so they can be
    # collected for the saver below.
    with tf.variable_scope("explore_agent"):
        # self._transform presumably normalizes the raw input — confirm
        # against its definition.
        state_input = Lambda(self._transform)(self.ph_state)
        last_layer = state_input
        # All conv layers but the last use 'same' padding.
        for (out_size, kernel, stride) in self.filter_arch[:-1]:
            last_layer = Conv2D(
                out_size, (kernel, kernel),
                strides=(stride, stride),
                activation="relu",
                padding="same",
            )(last_layer)
        # last convolution: 'valid' padding, expected to reduce the
        # spatial dims to 1x1 (the squeeze on axis=[1, 2] below relies
        # on this — verify filter_arch guarantees it).
        (out_size, kernel, stride) = self.filter_arch[-1]
        convolution_layer = Conv2D(
            out_size, (kernel, kernel),
            strides=(stride, stride),
            activation="relu",
            padding="valid",
        )(last_layer)
        # Policy logits via a 1x1 conv head; squeeze the 1x1 spatial dims.
        self.pi_logic_outs = tf.squeeze(
            Conv2D(self.action_dim, (1, 1),
                   padding="same")(convolution_layer),
            axis=[1, 2],
        )
        baseline_flat = Flatten()(convolution_layer)
        # Scalar state-value baseline.
        self.baseline = tf.squeeze(
            tf.layers.dense(
                inputs=baseline_flat,
                units=1,
                activation=None,
                kernel_initializer=custom_norm_initializer(0.01),
            ),
            1,
        )
        # Sample one action per state from the policy logits.
        self.out_actions = tf.squeeze(
            tf.multinomial(self.pi_logic_outs, num_samples=1,
                           output_dtype=tf.int32),
            1,
            name="out_action",
        )

    # create learner: placeholders fed from collected trajectories.
    self.ph_bp_logic_outs = tf.placeholder(self.dtype,
                                           shape=(None, self.action_dim),
                                           name="ph_b_logits")
    self.ph_actions = tf.placeholder(tf.int32, shape=(None, ),
                                     name="ph_action")
    self.ph_dones = tf.placeholder(tf.bool, shape=(None, ),
                                   name="ph_dones")
    self.ph_rewards = tf.placeholder(self.dtype, shape=(None, ),
                                     name="ph_rewards")

    # Split the tensor into batches at known episode cut boundaries.
    # [batch_count * batch_step] -> [batch_step, batch_count]
    batch_step = self.sample_batch_steps

    def split_batches(tensor, drop_last=False):
        """Reshape a flat batch into [T, B, ...]; optionally drop the
        final time step (used so the last value can serve as bootstrap)."""
        batch_count = tf.shape(tensor)[0] // batch_step
        reshape_tensor = tf.reshape(
            tensor,
            tf.concat([[batch_count, batch_step],
                       tf.shape(tensor)[1:]], axis=0),
        )
        # swap B and T axes; the rank is taken from the static shape of
        # the dynamic-shape tensor (tf.shape(tensor).shape[0]).
        res = tf.transpose(
            reshape_tensor,
            [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))),
        )
        if drop_last:
            return res[:-1]
        return res

    # V-trace loss: discounts zero out at episode ends (~dones), rewards
    # are clipped to [-1, 1], and the last baseline bootstraps the return.
    self.loss = vtrace_loss(
        bp_logic_outs=split_batches(self.ph_bp_logic_outs, drop_last=True),
        tp_logic_outs=split_batches(self.pi_logic_outs, drop_last=True),
        actions=split_batches(self.ph_actions, drop_last=True),
        discounts=split_batches(tf.cast(~self.ph_dones, tf.float32) * GAMMA,
                                drop_last=True),
        rewards=split_batches(tf.clip_by_value(self.ph_rewards, -1, 1),
                              drop_last=True),
        values=split_batches(self.baseline, drop_last=True),
        bootstrap_value=split_batches(self.baseline)[-1],
    )

    global_step = tf.Variable(0, trainable=False, dtype=tf.int32)
    if self.opt_type == "adam":
        if self.lr_schedule:
            learning_rate = self._get_lr(global_step)
        else:
            learning_rate = LR
        optimizer = AdamOptimizer(learning_rate)
    elif self.opt_type == "rmsprop":
        optimizer = tf.train.RMSPropOptimizer(LR, decay=0.99, epsilon=0.1,
                                              centered=True)
    else:
        raise KeyError("invalid opt_type: {}".format(self.opt_type))

    grads_and_vars = optimizer.compute_gradients(self.loss)

    # global norm clipping across all gradients at once.
    grads, var = zip(*grads_and_vars)
    grads, _ = tf.clip_by_global_norm(grads, self.grad_norm_clip)
    clipped_gvs = list(zip(grads, var))
    self.train_op = optimizer.apply_gradients(clipped_gvs,
                                              global_step=global_step)

    # fixme: help to show the learning rate among training processing
    # NOTE(review): `_lr` is a private AdamOptimizer attribute; the
    # rmsprop branch may not expose the same name — confirm before
    # relying on self.lr there.
    self.lr = optimizer._lr

    # NOTE(review): other models in this file wrap the tensor in a list
    # when constructing TFVariables — confirm TFVariables accepts a bare
    # tensor here as well.
    self.actor_var = TFVariables(self.out_actions, self.sess)

    self.sess.run(global_variables_initializer())

    # Collect only the actor-scope trainables for checkpointing.
    self.explore_paras = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")

    self.saver = Saver({t.name: t for t in self.explore_paras},
                       max_to_keep=self.max_to_keep)

    return True