def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False,
             layers=None, net_arch=None, layer_norm=False, feature_extraction="cnn", **kwargs):
    """
    Build the policy graph: relation block -> LSTM -> value head and action distribution.

    ``layers`` and ``net_arch`` are accepted but never read by this constructor.
    Observation scaling is requested from the parent only when
    ``feature_extraction == "cnn"``.
    """
    # The recurrent state stores the LSTM cell and hidden states side by side,
    # which is why its width is 2 * n_lstm.
    lstm_state_shape = (2 * n_lstm, )
    scale_obs = feature_extraction == "cnn"
    super(RelationalLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                               state_shape=lstm_state_shape, reuse=reuse,
                                               scale=scale_obs)
    self._kwargs_check(feature_extraction, kwargs)

    with tf.variable_scope("model", reuse=reuse):
        print('self.processed_obs', self.processed_obs)
        relational_features = self.relation_block(self.processed_obs)

        # Split the flat batch into a per-step sequence for the recurrent layer.
        seq_inputs = batch_to_seq(relational_features, self.n_env, n_steps)
        print('input_sequence', seq_inputs)
        seq_dones = batch_to_seq(self.dones_ph, self.n_env, n_steps)
        lstm_out, self.snew = lstm(seq_inputs, seq_dones, self.states_ph, 'lstm1',
                                   n_hidden=n_lstm, layer_norm=layer_norm)
        latent = seq_to_batch(lstm_out)

        # The same LSTM latent feeds the value head and the distribution.
        value_fn = linear(latent, 'vf', 1)
        distribution_parts = self.pdtype.proba_distribution_from_latent(latent, latent)
        self._proba_distribution, self._policy, self.q_value = distribution_parts
    self._value_fn = value_fn
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False,
             layers=None, cnn_extractor=nature_cnn, layer_norm=False, feature_extraction="cnn",
             **kwargs):
    """
    Build the policy graph: feature extractor -> LSTM -> value head and action distribution.

    With ``feature_extraction == "cnn"`` the extractor receives the tuple
    ``(processed_obs, processed_obs_len)`` plus ``**kwargs``; otherwise the
    flattened observation goes through a tanh MLP whose widths come from
    ``layers`` (``[64, 64]`` when ``layers`` is None).
    """
    scale_obs = feature_extraction == "cnn"
    super(LstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm,
                                     reuse, scale=scale_obs)

    hidden_sizes = [64, 64] if layers is None else layers

    with tf.variable_scope("model", reuse=reuse):
        if feature_extraction == "cnn":
            features = cnn_extractor((self.processed_obs, self.processed_obs_len), **kwargs)
        else:
            features = tf.layers.flatten(self.processed_obs)
            for idx, width in enumerate(hidden_sizes):
                pre_activation = linear(features, 'pi_fc{}'.format(idx), n_hidden=width,
                                        init_scale=np.sqrt(2))
                features = tf.tanh(pre_activation)

        # Split the flat batch into a per-step sequence for the recurrent layer.
        seq_inputs = batch_to_seq(features, self.n_env, n_steps)
        seq_masks = batch_to_seq(self.masks_ph, self.n_env, n_steps)
        lstm_out, self.snew = lstm(seq_inputs, seq_masks, self.states_ph, 'lstm1',
                                   n_hidden=n_lstm, layer_norm=layer_norm)
        latent = seq_to_batch(lstm_out)

        value_fn = linear(latent, 'vf', 1)
        distribution_parts = self.pdtype.proba_distribution_from_latent(latent, latent)
        self.proba_distribution, self.policy, self.q_value = distribution_parts

    self.value_fn = value_fn
    # One zeroed row of width 2 * n_lstm per environment.
    self.initial_state = np.zeros((self.n_env, n_lstm * 2), dtype=np.float32)
    self._setup_init()
def __init__(self, sess, tasks, ob_spaces, ac_space_dict, n_envs_per_task, n_steps, reuse=False,
             feature_extractor=shared_network, layer_norm=True):
    """
    Build a shared feature-extractor + LSTM trunk and one value/policy head per task.

    The LSTM width is read from ``config.n_lstm``. The trunk ends in a ReLU
    layer of ``config.max_action_space`` units, which every task head consumes.
    """
    super(MultiTaskLSTMA2CPolicy, self).__init__(sess, tasks, ob_spaces, ac_space_dict,
                                                 n_envs_per_task, n_steps, reuse)
    self.n_lstm = config.n_lstm

    with tf.variable_scope("input", reuse=True):
        # mask (done t-1)
        self.masks_ph = tf.placeholder(tf.float32, [None], name="masks_ph")
        # states: n_lstm * 2 wide because of the cell and hidden states of the LSTM
        self.states_ph = tf.placeholder(tf.float32, [None, self.n_lstm * 2], name="states_ph")

    with tf.variable_scope("shared_model", reuse=reuse):
        features = feature_extractor(self.processed_obs)
        # Shape notes below are carried over from the original author's comments.
        # n_steps x [n_env x feature extractor output shape]
        seq_inputs = batch_to_seq(features, self.n_steps)
        # n_steps x [n_env x 1]
        seq_masks = batch_to_seq(self.masks_ph, self.n_steps)
        # n_steps x [n_env x n_lstm]
        lstm_out, self.state_new = lstm(seq_inputs, seq_masks, self.states_ph, 'lstm1',
                                        n_hidden=self.n_lstm, layer_norm=layer_norm)
        # (n_steps * n_envs) x n_lstm
        latent_vector = seq_to_batch(lstm_out)
        union_pre_activation = linear(latent_vector, 'fc-union',
                                      n_hidden=config.max_action_space, init_scale=np.sqrt(2))
        ac_space_union = tf.nn.relu(union_pre_activation)

    for task in self.pdtype_dict:
        with tf.variable_scope(task + "_model", reuse=reuse):
            self.value_fn_dict[task] = linear(ac_space_union, 'vf', 1)
            head = self.pdtype_dict[task].proba_distribution_from_latent(
                ac_space_union, ac_space_union, init_scale=0.01)
            task_distribution, task_policy, task_q_value = head
            # Distribution object: can be used to sample and to compute
            # neglogp / entropy on the policy layer.
            self.proba_distribution_dict[task] = task_distribution
            # A linear layer.
            self.policy_dict[task] = task_policy
            # A linear layer.
            self.q_value_dict[task] = task_q_value
    self._setup_init()
def __init__(self, ob_space, ac_space, hidsize, ob_mean, ob_std, feat_dim, layernormalize, nl,
             n_env, n_steps, reuse, n_lstm=256, scope="policy"):
    """
    Build the recurrent policy head on top of ``self.flat_features``.

    The flat features go through an LSTM; the normalized LSTM output is
    concatenated with the raw flat features, passed through two fully
    connected layers of ``hidsize`` units, and handed to ``self.get_pdparam``
    to obtain the distribution parameters and the value prediction.

    All arguments are forwarded unchanged to the parent constructor; only
    ``hidsize``, ``n_lstm`` and ``scope`` are additionally read here.
    """
    super(RnnPolicy, self).__init__(ob_space, ac_space, hidsize, ob_mean, ob_std, feat_dim,
                                    layernormalize, nl, n_env, n_steps, reuse, n_lstm, scope)
    with tf.variable_scope(scope, reuse=self.reuse):
        ## Use features
        x = self.flat_features
        input_sequence = batch_to_seq(x, self.n_env, self.n_steps)
        masks = batch_to_seq(self.masks_ph, self.n_env, self.n_steps)
        rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1',
                                     n_hidden=n_lstm, layer_norm=False)
        rnn_output = seq_to_batch(rnn_output)
        # Fix: the original called layernorm(rnn_output) and discarded the result.
        # TensorFlow tensors are immutable, so the normalization never reached the
        # layers below. Assign it, as the sibling ErrorPredRnnPolicy does.
        # NOTE(review): this changes the network output; confirm against any
        # checkpoints trained with the un-normalized version.
        rnn_output = layernorm(rnn_output)

        ## Concat
        q = self.flat_features
        q = tf.concat([q, rnn_output], axis=1)
        # NOTE(review): `activ` is not defined in this method; presumably a
        # module-level activation function - verify.
        q = fc(q, units=hidsize, activation=activ, name="fc1")
        q = fc(q, units=hidsize, activation=activ, name="fc2")
        pdparam, vpred = self.get_pdparam(q)
    # Restore the leading dimensions described by self.sh.
    self.pdparam = pdparam = unflatten_first_dim(pdparam, self.sh)
    self.vpred = unflatten_first_dim(vpred, self.sh)[:, :, 0]
    self.pd = pd = self.ac_pdtype.proba_distribution_from_flat(pdparam)
    self.a_samp = pd.sample()
    self.entropy = pd.entropy()
    self.nlp_samp = pd.neglogp(self.a_samp)
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False,
             layers=None, cnn_extractor=nature_cnn, layer_norm=False, feature_extraction="cnn",
             **kwargs):
    """
    Build the policy graph: CNN -> coordinate channels -> CIN -> max-pool -> LSTM -> heads.

    The parent state is set up through ``self.AC_init`` rather than
    ``super().__init__``. ``layers`` is accepted but never read here.
    The first attention tensor returned by ``CIN`` is kept on ``self.attention``.
    """
    # AC_init is the helper added to LstmPolicy to initialise ActorCriticPolicy.
    self.AC_init(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
                 feature_extraction)

    with tf.variable_scope("model", reuse=reuse):
        # Per the original notes: [B, H, W, depth].
        cnn_features = cnn_extractor(self.processed_x, **kwargs)
        print('extracted_features', cnn_features)

        # Append the coordinate channels along the last axis
        # (original note: [B, H, W, D+2]).
        coordinates = get_coor(cnn_features)
        entities = tf.concat([cnn_features, coordinates], axis=3)
        print('entities:', entities)

        cin_output, attentions = CIN(entities, 'CIN')
        self.attention = attentions[0]
        print('CIN:', cin_output)

        # Max-pool with one window (and stride) of H * W positions.
        n_positions = entities.shape[1] * entities.shape[2]
        print('maxpooling_shape:', n_positions)
        pooled = tf.nn.pool(cin_output,
                            window_shape=[n_positions],
                            padding='VALID',
                            strides=[n_positions],
                            pooling_type="MAX")
        print('cin_maxpooling_output', pooled)

        seq_inputs = batch_to_seq(pooled, self.n_env, n_steps)
        seq_masks = batch_to_seq(self.masks_ph, self.n_env, n_steps)
        lstm_out, self.snew = lstm(seq_inputs, seq_masks, self.states_ph, 'lstm1',
                                   n_hidden=n_lstm, layer_norm=layer_norm)
        latent = seq_to_batch(lstm_out)

        value_fn = linear(latent, 'vf', 1)
        distribution_parts = self.pdtype.proba_distribution_from_latent(latent, latent)
        self.proba_distribution, self.policy, self.q_value = distribution_parts

    self.value_fn = value_fn
    # One zeroed row of width 2 * n_lstm per environment.
    self.initial_state = np.zeros((self.n_env, n_lstm * 2), dtype=np.float32)
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False, layers=None,
             net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn, layer_norm=False, feature_extraction="cnn",
             **kwargs):
    """
    Build the recurrent actor-critic graph, either in legacy mode or from ``net_arch``.

    Legacy mode (``net_arch is None``): a CNN extractor or an MLP described by
    ``layers`` (default ``[64, 64]``) feeds a single LSTM whose output is shared
    by the value head and the action distribution.

    ``net_arch`` mode: the list is read left to right. Integers add shared dense
    layers, the string ``"lstm"`` inserts the (single, mandatory) LSTM, and the
    first dict ends the shared part and supplies the ``'pi'`` / ``'vf'`` layer
    sizes of the separate policy and value branches.

    :raises NotImplementedError: if ``net_arch`` is given with
        ``feature_extraction == "cnn"``, or if ``"lstm"`` appears inside a
        ``'pi'`` / ``'vf'`` list.
    :raises ValueError: if ``net_arch`` contains more than one ``"lstm"`` in its
        shared part, or none at all.
    """
    # state_shape = [n_lstm * 2] dim because of the cell and hidden states of the LSTM
    super(LstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                     state_shape=(2 * n_lstm, ), reuse=reuse,
                                     scale=(feature_extraction == "cnn"))

    self._kwargs_check(feature_extraction, kwargs)

    if net_arch is None:  # Legacy mode
        if layers is None:
            layers = [64, 64]
        else:
            warnings.warn(
                "The layers parameter is deprecated. Use the net_arch parameter instead."
            )

        with tf.variable_scope("model", reuse=reuse):
            if feature_extraction == "cnn":
                extracted_features = cnn_extractor(self.processed_obs, **kwargs)
            else:
                extracted_features = tf.layers.flatten(self.processed_obs)
                for i, layer_size in enumerate(layers):
                    extracted_features = act_fun(
                        linear(extracted_features, 'pi_fc' + str(i), n_hidden=layer_size,
                               init_scale=np.sqrt(2)))
            # Split the flat batch into a per-step sequence for the recurrent layer.
            input_sequence = batch_to_seq(extracted_features, self.n_env, n_steps)
            masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
            rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm,
                                         layer_norm=layer_norm)
            rnn_output = seq_to_batch(rnn_output)
            value_fn = linear(rnn_output, 'vf', 1)

            # The same LSTM output is used as policy latent and as value latent.
            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output)

        self._value_fn = value_fn
    else:  # Use the new net_arch parameter
        if layers is not None:
            warnings.warn(
                "The new net_arch parameter overrides the deprecated layers parameter."
            )
        if feature_extraction == "cnn":
            raise NotImplementedError()

        with tf.variable_scope("model", reuse=reuse):
            latent = tf.layers.flatten(self.processed_obs)
            policy_only_layers = []  # Layer sizes of the network that only belongs to the policy network
            value_only_layers = []  # Layer sizes of the network that only belongs to the value network

            # Iterate through the shared layers and build the shared parts of the network
            lstm_layer_constructed = False
            for idx, layer in enumerate(net_arch):
                if isinstance(layer, int):  # Check that this is a shared layer
                    layer_size = layer
                    latent = act_fun(
                        linear(latent, "shared_fc{}".format(idx), layer_size, init_scale=np.sqrt(2)))
                elif layer == "lstm":
                    if lstm_layer_constructed:
                        raise ValueError(
                            "The net_arch parameter must only contain one occurrence of 'lstm'!"
                        )
                    input_sequence = batch_to_seq(latent, self.n_env, n_steps)
                    masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
                    rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm,
                                                 layer_norm=layer_norm)
                    latent = seq_to_batch(rnn_output)
                    lstm_layer_constructed = True
                else:
                    assert isinstance(
                        layer, dict
                    ), "Error: the net_arch list can only contain ints and dicts"
                    if 'pi' in layer:
                        assert isinstance(
                            layer['pi'], list
                        ), "Error: net_arch[-1]['pi'] must contain a list of integers."
                        policy_only_layers = layer['pi']

                    if 'vf' in layer:
                        assert isinstance(
                            layer['vf'], list
                        ), "Error: net_arch[-1]['vf'] must contain a list of integers."
                        value_only_layers = layer['vf']
                    # Entries after the first dict are never read.
                    break  # From here on the network splits up in policy and value network

            # Build the non-shared part of the policy-network
            latent_policy = latent
            for idx, pi_layer_size in enumerate(policy_only_layers):
                if pi_layer_size == "lstm":
                    raise NotImplementedError(
                        "LSTMs are only supported in the shared part of the policy network."
                    )
                assert isinstance(
                    pi_layer_size, int
                ), "Error: net_arch[-1]['pi'] must only contain integers."
                latent_policy = act_fun(
                    linear(latent_policy, "pi_fc{}".format(idx), pi_layer_size, init_scale=np.sqrt(2)))

            # Build the non-shared part of the value-network
            latent_value = latent
            for idx, vf_layer_size in enumerate(value_only_layers):
                if vf_layer_size == "lstm":
                    raise NotImplementedError(
                        "LSTMs are only supported in the shared part of the value function "
                        "network.")
                assert isinstance(
                    vf_layer_size, int
                ), "Error: net_arch[-1]['vf'] must only contain integers."
                latent_value = act_fun(
                    linear(latent_value, "vf_fc{}".format(idx), vf_layer_size, init_scale=np.sqrt(2)))

            # This check runs only after the layers above have already been added to the graph.
            if not lstm_layer_constructed:
                raise ValueError(
                    "The net_arch parameter must contain at least one occurrence of 'lstm'!"
                )

            self._value_fn = linear(latent_value, 'vf', 1)
            # TODO: why not init_scale = 0.001 here like in the feedforward
            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(latent_policy, latent_value)
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False,
             layers=None, cnn_extractor=custom_cnn, layer_norm=False, feature_extraction="cnn",
             **kwargs):
    """
    Build the policy graph: CNN -> state-conditioned spatial attention -> LSTM -> heads.

    The attention logits combine a 1x1 convolution of the CNN features with a
    bias-free linear projection of the incoming recurrent state. The resulting
    softmax weights are kept on ``self.attention``.

    The parent state is set up through ``self.AC_init`` rather than
    ``super().__init__``. ``layers`` is accepted but never read here.
    ``**kwargs`` is forwarded to ``cnn_extractor`` and to both ``conv`` calls.
    """
    # AC_init is the helper added to LstmPolicy to initialise ActorCriticPolicy.
    self.AC_init(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
                 feature_extraction)

    with tf.variable_scope("model", reuse=reuse):
        # vectors v_t
        feature_map = cnn_extractor(self.processed_x, **kwargs)
        print('extracted_features', feature_map)

        map_height = feature_map.shape[1]
        map_width = feature_map.shape[2]
        n_channels = feature_map.shape[3]
        n_locations = map_height * map_width
        n_hiddens = 42

        # One feature vector per spatial location.
        location_features = tf.reshape(feature_map, [-1, n_locations, n_channels])
        print('x2', location_features)
        projected_features = tf.nn.relu(
            conv(feature_map, 'x3', n_filters=n_hiddens, filter_size=1, stride=1,
                 init_scale=np.sqrt(2), **kwargs))
        print('x3', projected_features)

        print('states', self.states_ph)
        # Original note: ob = [envs,steps] -- rnn_state = [envs]*steps.
        # Repeat each state row n_steps times, then flatten the first two axes.
        tiled_state = tf.expand_dims(self.states_ph, 1)
        tiled_state = tf.tile(tiled_state, [1, self.n_steps, 1])
        print('h0', tiled_state)
        tiled_state = tf.reshape(tiled_state, [-1, tiled_state.shape[2]])
        print('h0', tiled_state)
        state_proj = linear_without_bias(tiled_state, 'fc_h1', n_hidden=n_hiddens,
                                         init_scale=np.sqrt(2))
        print('h1', state_proj)

        # Broadcast the state projection to every spatial location.
        state_grid = tf.expand_dims(state_proj, 1)
        state_grid = tf.tile(state_grid, [1, n_locations, 1])
        print('h2', state_grid)
        state_grid = tf.reshape(state_grid, [-1, map_height, map_width, n_hiddens])
        print('h3', state_grid)

        combined = tf.nn.tanh(tf.add(state_grid, projected_features))
        logits = tf.nn.relu(
            conv(combined, 'a2', n_filters=1, filter_size=1, stride=1,
                 init_scale=np.sqrt(2), **kwargs))
        print('a2', logits)
        # attention
        attention_weights = tf.nn.softmax(tf.reshape(logits, [-1, n_locations]))
        print('a3', attention_weights)
        self.attention = attention_weights

        weights_per_channel = tf.expand_dims(attention_weights, 2)
        weights_per_channel = tf.tile(weights_per_channel, [1, 1, n_channels])
        print('a4', weights_per_channel)
        # NOTE(review): the sum runs over axis 2 (channels), leaving one value per
        # location; a conventional attention context would sum over axis 1
        # (locations). Behaviour kept as-is - confirm intent.
        context = tf.reduce_sum(tf.multiply(weights_per_channel, location_features), 2)
        print('context', context)

        seq_inputs = batch_to_seq(context, self.n_env, n_steps)
        seq_masks = batch_to_seq(self.masks_ph, self.n_env, n_steps)
        lstm_out, self.snew = lstm(seq_inputs, seq_masks, self.states_ph, 'lstm1',
                                   n_hidden=n_lstm, layer_norm=layer_norm)
        latent = seq_to_batch(lstm_out)

        value_fn = linear(latent, 'vf', 1)
        distribution_parts = self.pdtype.proba_distribution_from_latent(latent, latent)
        self.proba_distribution, self.policy, self.q_value = distribution_parts

    self.value_fn = value_fn
    # One zeroed row of width 2 * n_lstm per environment.
    self.initial_state = np.zeros((self.n_env, n_lstm * 2), dtype=np.float32)
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=4, reuse=False, layers=None,
             net_arch=None, act_fun=tf.tanh, cnn_extractor=custom_cnn, layer_norm=True,
             feature_extraction="cnn", params=None, **kwargs):
    """
    Build a CNN + LSTM actor-critic graph whose layout is taken from ``params``.

    The architecture is ``params['shared']`` (ints for shared dense layers and
    the string ``"lstm"`` for the single, mandatory LSTM) followed by separate
    policy / value branches sized by ``params['h_actor']`` / ``params['h_critic']``.
    ``cnn_extractor`` is called as ``cnn_extractor(self.processed_obs, params)``
    regardless of ``feature_extraction``.

    ``params`` is effectively required: leaving it as None fails with a
    TypeError on the first lookup. The ``layers`` and ``net_arch`` arguments are
    kept for signature compatibility but ignored; the architecture always comes
    from ``params``.

    :raises NotImplementedError: if ``"lstm"`` appears in the actor or critic list.
    :raises ValueError: if the shared part holds more than one ``"lstm"``, or none.
    """
    super(CustomCnnLnLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                                state_shape=(2 * n_lstm, ), reuse=reuse,
                                                scale=(feature_extraction == "cnn"))
    self._kwargs_check(feature_extraction, kwargs)

    # Fix: build a fresh list instead of appending to params['shared'] in place.
    # The original mutated the caller's config, so each policy construction
    # added one more {pi, vf} dict to it.
    net_arch = list(params['shared'])
    net_arch.append(dict(pi=params['h_actor'], vf=params['h_critic']))

    # The original "legacy mode" branch (net_arch is None) was unreachable,
    # because net_arch is always rebuilt above; it has been removed.
    with tf.variable_scope("model", reuse=reuse):
        extracted_features = cnn_extractor(self.processed_obs, params)
        latent = tf.layers.flatten(extracted_features)
        policy_only_layers = []  # Layer sizes used only by the policy branch
        value_only_layers = []  # Layer sizes used only by the value branch

        # Build the shared part of the network.
        lstm_layer_constructed = False
        for idx, layer in enumerate(net_arch):
            if isinstance(layer, int):
                latent = act_fun(
                    linear(latent, "shared_fc{}".format(idx), layer, init_scale=np.sqrt(2)))
            elif layer == "lstm":
                if lstm_layer_constructed:
                    raise ValueError("The net_arch parameter must only contain one occurrence of 'lstm'!")
                input_sequence = batch_to_seq(latent, self.n_env, n_steps)
                masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
                rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1',
                                             n_hidden=n_lstm, layer_norm=layer_norm)
                latent = seq_to_batch(rnn_output)
                lstm_layer_constructed = True
            else:
                assert isinstance(layer, dict), "Error: the net_arch list can only contain ints and dicts"
                if 'pi' in layer:
                    assert isinstance(layer['pi'], list), \
                        "Error: net_arch[-1]['pi'] must contain a list of integers."
                    policy_only_layers = layer['pi']
                if 'vf' in layer:
                    assert isinstance(layer['vf'], list), \
                        "Error: net_arch[-1]['vf'] must contain a list of integers."
                    value_only_layers = layer['vf']
                # The first dict ends the shared part; later entries are never read.
                break

        # Policy-only branch.
        latent_policy = latent
        for idx, pi_layer_size in enumerate(policy_only_layers):
            if pi_layer_size == "lstm":
                raise NotImplementedError("LSTMs are only supported in the shared part of the policy network.")
            assert isinstance(pi_layer_size, int), "Error: net_arch[-1]['pi'] must only contain integers."
            latent_policy = act_fun(
                linear(latent_policy, "pi_fc{}".format(idx), pi_layer_size, init_scale=np.sqrt(2)))

        # Value-only branch.
        latent_value = latent
        for idx, vf_layer_size in enumerate(value_only_layers):
            if vf_layer_size == "lstm":
                raise NotImplementedError("LSTMs are only supported in the shared part of the value function "
                                          "network.")
            assert isinstance(vf_layer_size, int), "Error: net_arch[-1]['vf'] must only contain integers."
            latent_value = act_fun(
                linear(latent_value, "vf_fc{}".format(idx), vf_layer_size, init_scale=np.sqrt(2)))

        if not lstm_layer_constructed:
            raise ValueError("The net_arch parameter must contain at least one occurrence of 'lstm'!")

        self._value_fn = linear(latent_value, 'vf', 1)
        self._proba_distribution, self._policy, self.q_value = \
            self.pdtype.proba_distribution_from_latent(latent_policy, latent_value)
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=150, reuse=False, layers=None,
             goal_num=1, goal_net_arch=None, net_arch=None, act_fun=tf.tanh, cnn_extractor=nature_cnn,
             layer_norm=False, goal_encoder='mlp', feature_extraction="mlp", **kwargs):
    """
    Build a goal-conditioned recurrent actor-critic graph.

    A goal observation placeholder (``self.obs_goals``) is turned into a goal
    code ``z`` according to ``goal_encoder``:

    * ``"mlp_sample"``: ``mlp_goal_encoder`` returns ``(z_mu, z_log_sigma_sq)`` and
      ``z`` is sampled as ``z_mu + sqrt(exp(z_log_sigma_sq)) * eps``;
    * ``"mlp"``: ``z`` is ``z_mu``;
    * ``"no_encoder"`` / ``"no_goal_proposing"``: ``z`` is the goal observation
      itself with gradients stopped.

    ``self.use_input_z`` (default False) switches ``self.z_goal`` from that value
    to the externally fed ``self.z_goal_input``. Unless ``goal_encoder`` is
    ``"no_goal_proposing"``, ``z_goal`` is concatenated to the flattened
    observation before the policy network, which is then built either in
    legacy mode (``net_arch is None``) or from ``net_arch``.

    ``self.latent_loss`` is the KL term scaled by 0.01 for ``"mlp_sample"`` and
    0 otherwise. ``goal_num`` is accepted but never read. The "model" scope is
    always opened with ``tf.AUTO_REUSE``; ``reuse`` is only passed to the parent.

    :raises ValueError: if ``goal_encoder`` is not one of the values above, if
        ``net_arch`` contains more than one ``"lstm"`` in its shared part, or none.
    :raises NotImplementedError: if ``net_arch`` is given with
        ``feature_extraction == "cnn"``, or if ``"lstm"`` appears inside a
        ``'pi'`` / ``'vf'`` list.
    """
    # state_shape = [n_lstm * 2] dim because of the cell and hidden states of the LSTM
    # NOTE(review): observation scaling is enabled for "mlp" here, whereas the
    # sibling policies enable it for "cnn" - confirm this is intended.
    super(GoalsConditionedLSTMPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                                     state_shape=(2 * n_lstm, ), reuse=reuse,
                                                     scale=(feature_extraction == "mlp"))
    self.goal_encoder = goal_encoder
    self.name = "lstm_policy_" + goal_encoder

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        self.obs_goals = tf.placeholder(dtype=ob_space.dtype, shape=(None, ob_space.shape[0]),
                                        name='goal_states')
        obs_goals_reshape = self.obs_goals

        if goal_encoder == "mlp_sample":
            logging.info('mlp encoder with z sampling')
            self.z_mu, self.z_log_sigma_sq = mlp_goal_encoder(obs_goals_reshape, goal_net_arch, act_fun)
            eps = tf.random_normal(shape=tf.shape(self.z_log_sigma_sq), mean=0, stddev=1,
                                   dtype=tf.float32)
            self.z_goal_sample = self.z_mu + tf.sqrt(tf.exp(self.z_log_sigma_sq)) * eps
        elif goal_encoder == "mlp":
            logging.info('mlp encoder with z mu')
            self.z_mu, self.z_log_sigma_sq = mlp_goal_encoder(obs_goals_reshape, goal_net_arch, act_fun)
            self.z_goal_sample = self.z_mu
        elif goal_encoder in ("no_encoder", "no_goal_proposing"):
            self.z_goal_sample = tf.stop_gradient(self.obs_goals)
        else:
            # Fix: the original fell through and failed below with an
            # AttributeError on self.z_goal_sample.
            raise ValueError("Unknown goal_encoder: {!r}".format(goal_encoder))

        self.z_goal_input = tf.placeholder(dtype=ob_space.dtype, shape=self.z_goal_sample.shape,
                                           name='input_z_goal')
        self.use_input_z = tf.placeholder_with_default(False, shape=(), name='use_input_z')

        def use_sample():
            return self.z_goal_sample

        def use_input():
            return self.z_goal_input

        self.z_goal = tf.cond(self.use_input_z, use_input, use_sample)

        if goal_encoder == 'no_goal_proposing':
            latent = tf.layers.flatten(self.processed_obs)
        else:
            latent = tf.concat([tf.layers.flatten(self.processed_obs), self.z_goal], 1)
        # Fix: the original used '%f' % latent.shape; %f needs a real number and a
        # shape object is not one. Log it with %s and lazy logging arguments.
        logging.info('latent shape %s', latent.shape)

        if net_arch is None:  # Legacy mode
            if layers is None:
                layers = [64, 64]
            else:
                warnings.warn("The layers parameter is deprecated. Use the net_arch parameter instead.")

            if feature_extraction == "cnn":
                extracted_features = cnn_extractor(self.processed_obs, **kwargs)
            else:
                extracted_features = latent
                for i, layer_size in enumerate(layers):
                    extracted_features = act_fun(
                        linear(extracted_features, 'pi_fc' + str(i), n_hidden=layer_size,
                               init_scale=np.sqrt(2)))
            input_sequence = batch_to_seq(extracted_features, self.n_env, n_steps)
            masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
            rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1',
                                         n_hidden=n_lstm, layer_norm=layer_norm)
            rnn_output = seq_to_batch(rnn_output)
            value_fn = linear(rnn_output, 'vf', 1)

            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(rnn_output, rnn_output)

            self._value_fn = value_fn
        else:  # Use the new net_arch parameter
            if layers is not None:
                warnings.warn("The new net_arch parameter overrides the deprecated layers parameter.")
            if feature_extraction == "cnn":
                raise NotImplementedError()

            policy_only_layers = []  # Layer sizes of the network that only belongs to the policy network
            value_only_layers = []  # Layer sizes of the network that only belongs to the value network

            # Iterate through the shared layers and build the shared parts of the network
            lstm_layer_constructed = False
            for idx, layer in enumerate(net_arch):
                if isinstance(layer, int):  # Check that this is a shared layer
                    latent = act_fun(
                        linear(latent, "shared_fc{}".format(idx), layer, init_scale=np.sqrt(2)))
                elif layer == "lstm":
                    if lstm_layer_constructed:
                        raise ValueError(
                            "The net_arch parameter must only contain one occurrence of 'lstm'!")
                    input_sequence = batch_to_seq(latent, self.n_env, n_steps)
                    masks = batch_to_seq(self.dones_ph, self.n_env, n_steps)
                    rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1',
                                                 n_hidden=n_lstm, layer_norm=layer_norm)
                    latent = seq_to_batch(rnn_output)
                    lstm_layer_constructed = True
                else:
                    assert isinstance(layer, dict), \
                        "Error: the net_arch list can only contain ints and dicts"
                    if 'pi' in layer:
                        assert isinstance(layer['pi'], list), \
                            "Error: net_arch[-1]['pi'] must contain a list of integers."
                        policy_only_layers = layer['pi']

                    if 'vf' in layer:
                        assert isinstance(layer['vf'], list), \
                            "Error: net_arch[-1]['vf'] must contain a list of integers."
                        value_only_layers = layer['vf']
                    break  # From here on the network splits up in policy and value network

            # Build the non-shared part of the policy-network
            latent_policy = latent
            for idx, pi_layer_size in enumerate(policy_only_layers):
                if pi_layer_size == "lstm":
                    raise NotImplementedError(
                        "LSTMs are only supported in the shared part of the policy network.")
                assert isinstance(pi_layer_size, int), \
                    "Error: net_arch[-1]['pi'] must only contain integers."
                latent_policy = act_fun(
                    linear(latent_policy, "pi_fc{}".format(idx), pi_layer_size, init_scale=np.sqrt(2)))

            # Build the non-shared part of the value-network
            latent_value = latent
            for idx, vf_layer_size in enumerate(value_only_layers):
                if vf_layer_size == "lstm":
                    raise NotImplementedError(
                        "LSTMs are only supported in the shared part of the value function "
                        "network.")
                assert isinstance(vf_layer_size, int), \
                    "Error: net_arch[-1]['vf'] must only contain integers."
                latent_value = act_fun(
                    linear(latent_value, "vf_fc{}".format(idx), vf_layer_size, init_scale=np.sqrt(2)))

            if not lstm_layer_constructed:
                raise ValueError(
                    "The net_arch parameter must contain at least one occurrence of 'lstm'!")

            self._value_fn = linear(latent_value, 'vf', 1)
            # TODO: why not init_scale = 0.001 here like in the feedforward
            self._proba_distribution, self._policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(latent_policy, latent_value)

        if goal_encoder == "mlp_sample":
            # KL divergence between N(z_mu, exp(z_log_sigma_sq)) and a unit
            # Gaussian, summed over the latent axis and averaged over the batch.
            kl_coef = 0.01
            latent_loss = -0.5 * tf.reduce_sum(
                1 + self.z_log_sigma_sq - tf.square(self.z_mu) - tf.exp(self.z_log_sigma_sq), axis=1)
            self.latent_loss = tf.reduce_mean(latent_loss) * kl_coef
        else:
            self.latent_loss = 0
    self._setup_init()
def __init__(self, ob_space, ac_space, hidsize, ob_mean, ob_std, feat_dim, layernormalize, nl,
             n_env, n_steps, reuse, n_lstm=256, scope="policy"):
    """
    Build a recurrent policy head fed by features, predicted observations and prediction errors.

    Two extra placeholders (``self.pred_error`` and ``self.obs_pred``, both
    shaped ``(n_env, n_steps, hidsize)``) are flattened and concatenated with
    ``self.flat_features`` as the LSTM input. The normalized LSTM output is
    concatenated with the flat features again, passed through two fully
    connected layers, and handed to ``self.get_pdparam``.
    """
    super(ErrorPredRnnPolicy, self).__init__(ob_space, ac_space, hidsize, ob_mean, ob_std,
                                             feat_dim, layernormalize, nl, n_env, n_steps,
                                             reuse, n_lstm, scope)

    with tf.variable_scope(scope):
        self.flat_masks_ph = tf.reshape(self.masks_ph, [self.n_env * self.n_steps])

        # prediction error
        self.pred_error = tf.placeholder(dtype=tf.float32,
                                         shape=(self.n_env, self.n_steps, self.hidsize),
                                         name='pred_error')
        self.flat_pred_error = flatten_two_dims(self.pred_error)

        self.obs_pred = tf.placeholder(dtype=tf.float32,
                                       shape=(self.n_env, self.n_steps, self.hidsize),
                                       name='obs_pred')
        self.flat_obs_pred = flatten_two_dims(self.obs_pred)

    with tf.variable_scope(scope, reuse=self.reuse):
        lstm_features = tf.concat(
            [self.flat_features, self.flat_obs_pred, self.flat_pred_error], axis=1)
        seq_inputs = batch_to_seq(lstm_features, self.n_env, self.n_steps)
        seq_masks = batch_to_seq(self.masks_ph, self.n_env, self.n_steps)
        lstm_out, self.snew = lstm(seq_inputs, seq_masks, self.states_ph, 'lstm1',
                                   n_hidden=n_lstm, layer_norm=False)
        normed_out = layernorm(seq_to_batch(lstm_out))

        ## Concat
        hidden = tf.concat([self.flat_features, normed_out], axis=1)
        # NOTE(review): `activ` is not defined in this method; presumably a
        # module-level activation function - verify.
        hidden = fc(hidden, units=hidsize, activation=activ, name="fc1")
        hidden = fc(hidden, units=hidsize, activation=activ, name="fc2")
        flat_pdparam, flat_vpred = self.get_pdparam(hidden)

    # Restore the leading dimensions described by self.sh.
    self.pdparam = unflatten_first_dim(flat_pdparam, self.sh)
    self.vpred = unflatten_first_dim(flat_vpred, self.sh)[:, :, 0]
    self.pd = self.ac_pdtype.proba_distribution_from_flat(self.pdparam)
    self.a_samp = self.pd.sample()
    self.entropy = self.pd.entropy()
    self.nlp_samp = self.pd.neglogp(self.a_samp)
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=256, reuse=False,
             layers=None, cnn_extractor=nature_cnn, layer_norm=False, feature_extraction="cnn",
             **kwargs):
    """
    Build the policy graph: CNN -> coordinates -> MHDPA (2 heads) -> residual -> max -> LSTM -> heads.

    The parent state is set up through ``self.AC_init`` rather than
    ``super().__init__``. ``layers`` is accepted but never read here.
    The attention weights returned by ``MHDPA`` are kept on ``self.attention``.
    """
    # AC_init is the helper added to LstmPolicy to initialise ActorCriticPolicy.
    self.AC_init(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
                 feature_extraction)

    with tf.variable_scope("model", reuse=reuse):
        # Per the original notes: [B, H, W, depth].
        cnn_features = cnn_extractor(self.processed_x, **kwargs)
        print('extracted_features', cnn_features)

        # Append the coordinate channels along the last axis
        # (original note: [B, H, W, D+2]).
        coordinates = get_coor(cnn_features)
        entities = tf.concat([cnn_features, coordinates], axis=3)
        print('entities:', entities)

        # Original note: [B, H*W, num_heads, depth = D+2].
        attended, attention_weights = MHDPA(entities, "extracted_features", num_heads=2)
        print('MHDPA_output', attended)
        self.attention = attention_weights

        # Original note: [B, H*W, num_heads, depth].
        residual = residual_block(entities, attended)
        print('residual_output', residual)

        # Max over axis 1.
        pooled = tf.reduce_max(residual, axis=[1])
        print('residual_maxpooling_output', pooled)

        seq_inputs = batch_to_seq(pooled, self.n_env, n_steps)
        seq_masks = batch_to_seq(self.masks_ph, self.n_env, n_steps)
        lstm_out, self.snew = lstm(seq_inputs, seq_masks, self.states_ph, 'lstm1',
                                   n_hidden=n_lstm, layer_norm=layer_norm)
        latent = seq_to_batch(lstm_out)

        value_fn = linear(latent, 'vf', 1)
        distribution_parts = self.pdtype.proba_distribution_from_latent(latent, latent)
        self.proba_distribution, self.policy, self.q_value = distribution_parts

    self.value_fn = value_fn
    # One zeroed row of width 2 * n_lstm per environment.
    self.initial_state = np.zeros((self.n_env, n_lstm * 2), dtype=np.float32)
    self._setup_init()
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=128, reuse=False,
             layers=None, layer_norm=False, feature_extraction="not cnn", **kwargs):
    """
    Build the policy graph from a flat feature vector split into named groups.

    Columns of ``self.processed_x`` are consumed as follows (``p = 8``):
    ``[0, p)`` throughput and ``[p, 2p)`` download time (1-D convolutions),
    ``[2p, 2p+4)`` chunk size (1-D convolution), then three single columns,
    buffer size, last bit rate and end delay, each through a 128-unit dense
    layer. All branches use ReLU, are concatenated, and feed the LSTM.

    The parent state is set up through ``self.AC_init`` rather than
    ``super().__init__``. ``layers`` and ``**kwargs`` are accepted but never
    read here.
    """
    # AC_init is the helper added to LstmPolicy to initialise ActorCriticPolicy.
    self.AC_init(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
                 feature_extraction)

    with tf.variable_scope("model", reuse=reuse):
        print('self.processed_x', self.processed_x)
        batch_size = self.n_env * self.n_steps
        past_frame_num = 8
        n_hiddens = 128
        scalars_start = 2 * past_frame_num + 4

        def conv_branch(start, length, name):
            # Slice `length` columns, add a trailing channel axis, convolve, ReLU.
            columns = self.processed_x[:, start:start + length]
            columns = tf.reshape(columns, [-1, length, 1])
            return tf.nn.relu(conv1d(columns, scope=name))

        def dense_branch(column_index, name):
            # Slice one column and pass it through a ReLU dense layer.
            column = self.processed_x[:, column_index:column_index + 1]
            return tf.nn.relu(linear(column, scope=name, n_hidden=n_hiddens))

        # The original comment also lists rebuf and play_time_len after
        # last_bit_rate, but no such columns are read here.
        throughput_conv = conv_branch(0, past_frame_num, 'throughput_conv1d')
        download_time_conv = conv_branch(past_frame_num, past_frame_num, 'download_time_conv1d')
        chunk_size_conv = conv_branch(2 * past_frame_num, 4, 'chunk_size_conv1d')

        buffer_size_dense = dense_branch(scalars_start, 'buffer_size_dense')
        last_bit_rate_dense = dense_branch(scalars_start + 1, 'last_bit_rate_dense')
        end_delay_dense = dense_branch(scalars_start + 2, 'end_delay_dense')

        branches = [
            tf.reshape(throughput_conv, [batch_size, -1]),
            tf.reshape(download_time_conv, [batch_size, -1]),
            tf.reshape(chunk_size_conv, [batch_size, -1]),
            buffer_size_dense,
            last_bit_rate_dense,
            end_delay_dense,
        ]
        lstm_features = tf.concat(branches, axis=1)
        print('input', lstm_features)

        seq_inputs = batch_to_seq(lstm_features, self.n_env, n_steps)
        seq_masks = batch_to_seq(self.masks_ph, self.n_env, n_steps)
        lstm_out, self.snew = lstm(seq_inputs, seq_masks, self.states_ph, 'lstm1',
                                   n_hidden=n_lstm, layer_norm=layer_norm)
        latent = seq_to_batch(lstm_out)

        value_fn = linear(latent, 'vf', 1)
        distribution_parts = self.pdtype.proba_distribution_from_latent(latent, latent)
        self.proba_distribution, self.policy, self.q_value = distribution_parts

    self.value_fn = value_fn
    # One zeroed row of width 2 * n_lstm per environment.
    self.initial_state = np.zeros((self.n_env, n_lstm * 2), dtype=np.float32)
    self._setup_init()