    def __init__(self, ob_space, ac_space, replay_size=2000, grid_size=20):
        self.x = x = tf.placeholder(tf.float32, [None] + list(ob_space))
        self.action = tf.placeholder(tf.float32, [None, ac_space])
        self.reward = tf.placeholder(tf.float32, [None, 1])
        self.bs = tf.placeholder(dtype=tf.int32)
        self.replay_memory = []
        self.replay_size = replay_size
        self.grid_size = grid_size

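        # Convolutional encoder; the previous action and reward are concatenated
        # onto the flattened features before the recurrent layer.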
        x = tf.nn.relu(conv2d(x, 16, "l1", [8, 8], [4, 4]))
        x = conv_features = tf.nn.relu(conv2d(x, 32, "l2", [4, 4], [2, 2]))
        x = flatten(x)
        x = tf.nn.relu(
            linear(x, 256, "l3", normalized_columns_initializer(0.1)))
        x = tf.concat(axis=1, values=[x, self.action, self.reward])
        # introduce a "fake" batch dimension of 1 after flatten so that we can do LSTM over time dim
        x = tf.expand_dims(x, [0])

        size = 256
        lstm = GridPredictionLSTMCell(size,
                                      state_is_tuple=True,
                                      ac_space=ac_space,
                                      grid_size=grid_size)
        self.state_size = lstm.state_size
        step_size = tf.shape(self.x)[:1]

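        # Zero initial state plus matching placeholders so the (c, h, pred)
        # recurrent state can be threaded across rollout chunks.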
        c_init = np.zeros((1, lstm.state_size.c), np.float32)
        h_init = np.zeros((1, lstm.state_size.h), np.float32)
        pred_init = np.zeros((1, lstm.state_size.pred), np.float32)
        self.state_init = [c_init, h_init, pred_init]
        c_in = tf.placeholder(tf.float32, [1, lstm.state_size.c])
        h_in = tf.placeholder(tf.float32, [1, lstm.state_size.h])
        pred_in = tf.placeholder(tf.float32, [1, lstm.state_size.pred])
        self.state_in = [c_in, h_in, pred_in]

        state_in = PredictionLSTMStateTuple(c_in, h_in, pred_in)
        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(lstm,
                                                     x,
                                                     initial_state=state_in,
                                                     sequence_length=step_size,
                                                     time_major=False)
        lstm_c, lstm_h, lstm_pred = lstm_state
        x = tf.reshape(lstm_outputs, [-1, size])

        # Actor-critic branch
        self.logits = linear(x, ac_space, "action",
                             normalized_columns_initializer(0.01))
        self.vf = tf.reshape(
            linear(x, 1, "value", normalized_columns_initializer(1.0)), [-1])
        self.state_out = [lstm_c[:1, :], lstm_h[:1, :], lstm_pred[:1, :]]
        self.sample = categorical_sample(self.logits, ac_space)[0, :]

        # Auxiliary branch
        self.predictions = tf.reshape(
            lstm_pred, shape=[-1, grid_size, grid_size, ac_space])
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)
        self.target_weights = []
Example #2
    def __init__(self, ob_space, ac_space, replay_size=2000, grid_size=20):
        self.x = x = tf.placeholder(tf.float32, [None] + list(ob_space))
        self.action = tf.placeholder(tf.float32, [None, ac_space])
        self.reward = tf.placeholder(tf.float32, [None, 1])
        self.bs = tf.placeholder(dtype=tf.int32)
        self.replay_memory = []
        self.replay_size = replay_size
        self.grid_size = grid_size

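        # Exploration schedule: the action-selection probability is annealed
        # from 1.0 down to 0.1, epsilon-greedy style.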
        self.prob = 1.
        self.final_prob = 0.1
        self.anneal_rate = .00000018

        self.num_actions = ac_space

        x = tf.nn.relu(conv2d(x, 16, "l1", [8, 8], [4, 4]))
        x = conv_features = tf.nn.relu(conv2d(x, 32, "l2", [4, 4], [2, 2]))
        x = flatten(x)
        x = tf.nn.relu(linear(x, 256, "l3", normalized_columns_initializer(0.1)))
        x = tf.concat(axis=1, values=[x, self.action, self.reward])
        x = tf.expand_dims(x, [0])

        size = 256
        lstm = GridPredictionLSTMCell(size, state_is_tuple=True, ac_space=ac_space,
                                      grid_size=grid_size)
        self.state_size = lstm.state_size
        step_size = tf.shape(self.x)[:1]

        c_init = np.zeros((1, lstm.state_size.c), np.float32)
        h_init = np.zeros((1, lstm.state_size.h), np.float32)
        pred_init = np.zeros((1, lstm.state_size.pred), np.float32)
        self.state_init = [c_init, h_init, pred_init]
        c_in = tf.placeholder(tf.float32, [1, lstm.state_size.c])
        h_in = tf.placeholder(tf.float32, [1, lstm.state_size.h])
        pred_in = tf.placeholder(tf.float32, [1, lstm.state_size.pred])
        self.state_in = [c_in, h_in, pred_in]

        state_in = PredictionLSTMStateTuple(c_in, h_in, pred_in)
        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
            lstm, x, initial_state=state_in, sequence_length=step_size,
            time_major=False)
        lstm_c, lstm_h, lstm_pred = lstm_state

        # Q-learning branch
        x = tf.reshape(lstm_outputs, [-1, size])
        self.Q = linear(x, ac_space, "action", normalized_columns_initializer(0.01))
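        # Greedy state value: V(s) = max_a Q(s, a).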
        self.vf = tf.reduce_max(self.Q, axis=[1])

        self.state_out = [lstm_c[:1, :], lstm_h[:1, :], lstm_pred[:1, :]]

        # Auxiliary branch
        self.predictions = tf.reshape(lstm_pred, shape=[-1, grid_size, grid_size, ac_space])
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
        self.target_weights = []
Example #3
    def __init__(self, ob_space, ac_space, future_steps=5, pred_per_step=10, ckpt=None):
        self.ckpt_file = ckpt
        self.replay_memory = []
        self.future_steps = future_steps
        self.bs = tf.placeholder(dtype=tf.int32)
        self.num_actions = ac_space
        self.pred_per_step = pred_per_step
        self.num_pred = pred_per_step * future_steps
        self.use_target = tf.placeholder(dtype=tf.bool)
        self.default_last_prediction = tf.zeros(shape=[self.bs, self.num_pred])
        self.default_target_obs = tf.zeros(shape=[self.bs] + [self.future_steps] + list(ob_space))
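        # Zero-filled fallbacks, presumably substituted when no target
        # observations or previous predictions are fed.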

        # Inputs to the question network
        with tf.variable_scope("Question"):
            self.target_obs = tf.placeholder(tf.float32, [None] + [self.future_steps] + list(ob_space))
            feature_list = []
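            # One small conv tower per future frame, each emitting pred_per_step target features.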
            for i in range(self.future_steps):
                obs = tf.squeeze(tf.slice(self.target_obs, begin=[0, i, 0, 0, 0], size=[-1, 1, -1, -1, -1]), axis=[1])
                z = tf.nn.relu(conv2d(obs, 16, "ql1"+str(i), [8, 8], [4, 4]))
                z = tf.nn.relu(conv2d(z, 32, "ql2"+str(i), [4, 4], [2, 2]))
                z = flatten(z)
                z = tf.nn.relu(linear(z, self.pred_per_step, "ql3" + str(i), normalized_columns_initializer(0.1)))
                feature_list.append(z)
            self.target_predictions = tf.concat(feature_list, axis=1)

        with tf.variable_scope("Main"):
            self.x = x = tf.placeholder(tf.float32, [None] + list(ob_space))
            self.action = tf.placeholder(tf.float32, [None, ac_space])
            self.reward = tf.placeholder(tf.float32, [None, 1])
            self.last_prediction = tf.placeholder(tf.float32, [None, self.num_pred])

            x = tf.nn.relu(conv2d(x, 16, "l1", [8, 8], [4, 4]))
            x = tf.nn.relu(conv2d(x, 32, "l2", [4, 4], [2, 2]))
            x = flatten(x)
            x = tf.nn.relu(linear(x, 256, "l3", normalized_columns_initializer(0.1)))

            p = tf.nn.l2_normalize(self.last_prediction, dim=1)
            p = tf.nn.tanh(linear(p, 256, 'encode_pred', normalized_columns_initializer(0.1)))

            xmain = tf.concat(axis=1, values=[x, self.action, self.reward])
            xaux = tf.concat(axis=1, values=[x, self.action, self.reward, p])

            xmain = tf.nn.relu(linear(xmain, 256, "l4", normalized_columns_initializer(0.1)))

            # Auxiliary branch
            y = tf.nn.relu(linear(xaux, 256, 'auxbranch_l1', normalized_columns_initializer(0.1)))
            self.approx_predictions = linear(y, self.num_pred, 'auxbranch_l2', normalized_columns_initializer(0.1))

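            # use_target selects the question-network targets; otherwise the learned approximation is used.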
            self.predictions = tf.cond(self.use_target,
                                       lambda: self.target_predictions,
                                       lambda: self.approx_predictions)

            x = tf.concat(axis=1, values=[xmain, self.predictions])

            val = linear(x, 1, "value", normalized_columns_initializer(0.01))
            self.val = tf.reshape(val, shape=[-1])

        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
        self.target_weights = []
Example #4
    def __init__(self, ob_space, ac_space, replay_size=2000, grid_size=20, ckpt_file=None):
        self.x = x = tf.placeholder(tf.float32, [None] + list(ob_space))
        self.action = tf.placeholder(tf.float32, [None, ac_space])
        self.reward = tf.placeholder(tf.float32, [None, 1])
        self.pred = tf.placeholder(tf.float32, [None, grid_size, grid_size, ac_space])

        self.ac_space = ac_space
        self.bs = tf.placeholder(dtype=tf.int32)
        self.replay_memory = []
        self.replay_size = replay_size
        self.grid_size = grid_size

        self.prob = 1.
        self.final_prob = 0.1
        self.anneal_rate = .00000018

        self.num_actions = ac_space

        x = tf.nn.relu(conv2d(x, 16, "l1", [8, 8], [4, 4]))
        x = conv_features = tf.nn.relu(conv2d(x, 32, "l2", [4, 4], [2, 2]))
        x = flatten(x)
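        # Flatten the grid predictions so they can join the 2-D feature concatenation below.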
        pred = flatten(self.pred)
        x = tf.nn.relu(linear(x, 256, "l3", normalized_columns_initializer(0.1)))
        x = tf.concat(axis=1, values=[x, self.action, self.reward, pred])

        x = tf.nn.relu(linear(x, 256, "l4", normalized_columns_initializer(0.1)))

        self.Q = linear(x, ac_space, "action", normalized_columns_initializer(0.01))
        self.vf = tf.reduce_max(self.Q, axis=[1])

        # Auxiliary branch
        y = linear(x, 32 * (self.grid_size - 10) * (self.grid_size - 10), 'auxbranch',
                   normalized_columns_initializer(0.1))
        y = tf.reshape(y, shape=[-1, self.grid_size - 10, self.grid_size - 10, 32])
        deconv_weights = tf.get_variable("deconv/w", [4, 4, ac_space, 32])
        self.predictions = tf.nn.conv2d_transpose(y, deconv_weights,
                                                  output_shape=[self.bs, self.grid_size, self.grid_size, self.ac_space],
                                                  strides=[1, 2, 2, 1], padding='SAME')

        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
        self.target_weights = []
Example #5
    def __init__(self,
                 ob_space,
                 ac_space,
                 mode="Grid",
                 replay_size=2000,
                 grid_size=20):
        self.x = x = tf.placeholder(tf.float32, [None] + list(ob_space))
        self.action = tf.placeholder(tf.float32, [None, ac_space])
        self.reward = tf.placeholder(tf.float32, [None, 1])
        self.replay_memory = []
        self.replay_size = replay_size
        self.grid_size = grid_size
        self.bs = tf.placeholder(dtype=tf.int32)

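        # Same two-layer conv encoder as the earlier examples, but with VALID padding (smaller feature maps).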
        x = tf.nn.relu(conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = self.conv_features = tf.nn.relu(
            conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = flatten(x)
        x = tf.nn.relu(
            linear(x, 256, "l3", normalized_columns_initializer(0.1)))
        x = tf.concat(axis=1, values=[x, self.action, self.reward])
        # introduce a "fake" batch dimension of 1 after flatten so that we can do LSTM over time dim
        x = tf.expand_dims(x, [0])

        size = 256
        lstm = tf.nn.rnn_cell.BasicLSTMCell(size, state_is_tuple=True)
        self.state_size = lstm.state_size
        step_size = tf.shape(self.x)[:1]

        c_init = np.zeros((1, lstm.state_size.c), np.float32)
        h_init = np.zeros((1, lstm.state_size.h), np.float32)
        self.state_init = [c_init, h_init]
        c_in = tf.placeholder(tf.float32, [1, lstm.state_size.c])
        h_in = tf.placeholder(tf.float32, [1, lstm.state_size.h])
        self.state_in = [c_in, h_in]

        state_in = tf.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(lstm,
                                                     x,
                                                     initial_state=state_in,
                                                     sequence_length=step_size,
                                                     time_major=False)
        lstm_c, lstm_h = lstm_state
        x = tf.reshape(lstm_outputs, [-1, size])

        # Actor-critic branch
        self.logits = linear(x, ac_space, "action",
                             normalized_columns_initializer(0.01))
        self.vf = tf.reshape(
            linear(x, 1, "value", normalized_columns_initializer(1.0)), [-1])
        self.state_out = [lstm_c[:1, :], lstm_h[:1, :]]
        self.sample = categorical_sample(self.logits, ac_space)[0, :]

        # Auxiliary branch
        if mode == "Grid":
            y = linear(x, 32 * (grid_size - 7) * (grid_size - 7), 'auxbranch',
                       normalized_columns_initializer(0.1))
            y = tf.reshape(y, shape=[-1, grid_size - 7, grid_size - 7, 32])
            deconv_weights = weight_variable(shape=[4, 4, ac_space, 32],
                                             name='deconvweights')
            self.predictions = tf.nn.conv2d_transpose(
                y,
                deconv_weights,
                output_shape=[self.bs, grid_size, grid_size, ac_space],
                strides=[1, 2, 2, 1],
                padding='SAME')

        if mode == "Features":
            y = linear(x, 16 * 9 * 9, 'auxbranch',
                       normalized_columns_initializer(0.1))
            y = tf.reshape(y, shape=[-1, 9, 9, 16])
            deconv_weights = weight_variable(shape=[1, 1, 32 * ac_space, 16],
                                             name='deconvweights')
            predictions = tf.nn.conv2d_transpose(
                y,
                deconv_weights,
                output_shape=[self.bs, 9, 9, 32 * ac_space],
                strides=[1, 1, 1, 1],
                padding='SAME')
            self.predictions = tf.reshape(predictions,
                                          shape=[-1, 9, 9, 32, ac_space])

        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)
        self.target_weights = []
Example #6
    def __init__(self, ob_space, ac_space, replay_size=2000, num_predictions=100):

        self.num_predictions = num_predictions

        with tf.variable_scope("Question"):
            self.qnet = BasicQQuestionNet(ob_space=ob_space, ac_space=ac_space)

        with tf.variable_scope("Answer"):
            self.anet = BasicQAnswerNet(ob_space=ob_space, ac_space=ac_space)

        with tf.variable_scope("Question", reuse=True):

            # Note: the question network consumes the time-reversed version of the observation sequence
            self.x = x = tf.placeholder(tf.float32, [None] + list(ob_space))
            self.action = tf.placeholder(tf.float32, [None, ac_space])
            self.reward = tf.placeholder(tf.float32, [None, 1])
            self.bs = tf.placeholder(dtype=tf.int32)

            self.num_actions = ac_space

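            # Reversed inputs let each question-network step condition on frames
            # that occur later in the original sequence.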
            x = tf.reverse(x, axis=[0])
            rev_action = tf.reverse(self.action, axis=[0])
            rev_reward = tf.reverse(self.reward, axis=[0])
            x = flatten(x)
            x = tf.concat(axis=1, values=[x, rev_action, rev_reward])
            x = tf.expand_dims(x, [0])

            size = self.num_predictions  # number of predictions
            rnn = BasicRNNCell(num_units=size)
            self.pred_init = np.zeros((1, size), np.float32)
            self.pred_in = tf.placeholder(tf.float32, [1, size])

            step_size = tf.shape(self.x)[:1]

            rnn_output, rnn_state = tf.nn.dynamic_rnn(
                rnn, x, initial_state=self.pred_in, sequence_length=step_size,
                time_major=False)

            self.prediction_targs = tf.reshape(rnn_output, shape=[-1, size])

            # shape [1, size]. This will be the initial prediction in the answer network
            self.final_prediction_targs = tf.slice(
                self.prediction_targs,
                begin=[tf.shape(self.prediction_targs)[0] - 1, 0],
                size=[1, -1])

        with tf.variable_scope("Answer", reuse=True):
            x = self.x
            x = tf.nn.relu(conv2d(x, 16, "l1", [8, 8], [4, 4]))
            x = conv_features = tf.nn.relu(conv2d(x, 32, "l2", [4, 4], [2, 2]))
            x = flatten(x)
            x = tf.nn.relu(linear(x, 256, "l3", normalized_columns_initializer(0.1)))
            x = tf.concat(axis=1, values=[x, self.action, self.reward])
            x = tf.expand_dims(x, [0])

            size = 256
            lstm = BasicPredictionLSTMCell(num_units=size, state_is_tuple=True,
                                           num_pred=self.num_predictions)
            self.state_size = lstm.state_size
            step_size = tf.shape(self.x)[:1]

            c_init = np.zeros((1, lstm.state_size.c), np.float32)
            h_init = np.zeros((1, lstm.state_size.h), np.float32)
            pred_init = np.zeros((1, lstm.state_size.pred), np.float32)
            self.state_init = [c_init, h_init, pred_init]
            c_in = tf.placeholder(tf.float32, [1, lstm.state_size.c])
            h_in = tf.placeholder(tf.float32, [1, lstm.state_size.h])
            self.state_in = [c_in, h_in, self.final_prediction_targs]

            state_in = PredictionLSTMStateTuple(c_in, h_in, self.final_prediction_targs)
            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm, x, initial_state=state_in, sequence_length=step_size,
                time_major=False)
            lstm_c, lstm_h, lstm_pred = lstm_state

            # Q-learning branch
            x = tf.reshape(lstm_outputs, [-1, size])
            self.Q = linear(x, ac_space, "action", normalized_columns_initializer(0.01))
            self.vf = tf.reduce_max(self.Q, axis=[1])

            self.state_out = [lstm_c[:1, :], lstm_h[:1, :], lstm_pred[:1, :]]

            # Auxiliary branch
            self.predictions = lstm_pred
            self.target_weights = []

        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
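
# Usage sketch (not from the original source; `Model`, `sess`, and the feed values
# are hypothetical). For the variants whose `state_in` entries are all placeholders:
#
#   model = Model(ob_space=[84, 84, 4], ac_space=6)
#   feed = {model.x: obs, model.action: last_actions, model.reward: last_rewards}
#   for placeholder, init in zip(model.state_in, model.state_init):
#       feed[placeholder] = init
#   q_values, state_out = sess.run([model.Q, model.state_out], feed_dict=feed)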