Example #1
    def __init__(self, learning_rate, memory_size, batch_size, sess,
                 output_size):
        self.sess = sess

        #state_t
        self.encoder_input = tf.placeholder(tf.float32,
                                            shape=[None, n_features],
                                            name='encoder_input')
        self.encoder_output = mlp(inputs=self.encoder_input,
                                  n_output=output_size,
                                  scope='encoder_output',
                                  hiddens=[32, 16, 8])
        self.decoder_output = mlp(inputs=self.encoder_output,
                                  n_output=n_features,
                                  scope='decoder_output',
                                  hiddens=[8, 16, 32])
        self.encoder_output_ = tf.stop_gradient(self.encoder_output)  # detached copy of the encoder output

        #some const
        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size

        #memory
        self.memory = Memory(self.memory_size)

        #for train
        self.loss = tf.reduce_mean(
            tf.squared_difference(self.encoder_input, self.decoder_output))
        self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.loss)
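
These snippets call an mlp helper, a Memory replay buffer, and a module-level n_features, none of which are shown on this page. As a reading aid only, here is a minimal sketch of what such an mlp builder could look like in TF1-style graph code; the signature and layer choices are assumptions, not the project's actual implementation.

import tensorflow as tf

# Hypothetical helper: stacks fully connected ReLU layers under `scope`
# and finishes with a linear layer of size `n_output`.
def mlp(inputs, n_output, scope, hiddens=()):
    with tf.variable_scope(scope):
        out = inputs
        for i, h in enumerate(hiddens):
            out = tf.layers.dense(out, h, activation=tf.nn.relu, name='fc%d' % i)
        return tf.layers.dense(out, n_output, name='out')
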
Example #2
    def __init__(
            self,
            learning_rate,
            memory_size,
            batch_size,
            sess,
            output_size
    ):
        self.sess = sess

        #state_t
        self.encoder_input_t = tf.placeholder(tf.float32, shape=[None, n_features], name='encoder_input_t')

        self.encoder_output_t = mlp(inputs=self.encoder_input_t, n_output=output_size, scope='encoder_output_t',
                                    hiddens=[16, 8])
        # trailing '/' so the prefix does not also match the *_tpo scopes below
        self.encoder_output_t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoder_output_t/')

        self.decoder_output_t = mlp(inputs=self.encoder_output_t, n_output=n_features, scope='decoder_output_t')
        self.decoder_output_t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='decoder_output_t/')

        self.encoder_output_t_ = tf.stop_gradient(self.encoder_output_t)

        #state_t+1  tpo->time plus one
        self.encoder_input_tpo = tf.placeholder(tf.float32, shape=[None, n_features], name='encoder_input_tpo')

        self.encoder_output_tpo = mlp(inputs=self.encoder_input_tpo, n_output=output_size, scope='encoder_output_tpo',
                                      hiddens=[16, 8])
        self.encoder_output_tpo_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoder_output_tpo')

        self.decoder_output_tpo = mlp(inputs=self.encoder_output_tpo, n_output=n_features, scope='decoder_output_tpo')
        self.decoder_output_tpo_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='decoder_output_tpo')

        self.encoder_output_tpo_ = tf.stop_gradient(self.encoder_output_tpo)

        #sync
        self.sync_encoder = [tf.assign(x, y) for x, y in zip(self.encoder_output_t_params, self.encoder_output_tpo_params)]
        self.sync_decoder = [tf.assign(x, y) for x, y in zip(self.decoder_output_t_params, self.decoder_output_tpo_params)]

        #some const
        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size

        #memory
        self.memory = Memory(self.memory_size)

        #for train
        self.loss_0 = tf.reduce_mean(tf.squared_difference(self.encoder_input_t, self.decoder_output_t))
        self.loss_1 = tf.reduce_mean(tf.squared_difference(self.encoder_input_tpo, self.decoder_output_tpo))

        self.train_0 = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_0)
        self.train_1 = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_1)
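
The sync_encoder and sync_decoder lists above only copy the t+1 weights into the time-t copies when they are run in a session. A hedged usage sketch, where ae stands for an instance of the class above and n_features is the module-level constant; the random arrays are stand-ins for transitions sampled from ae.memory:

import numpy as np

# Assumed training loop: optimize both autoencoders, then periodically
# hard-copy the t+1 parameters into the time-t parameters.
for step in range(1000):
    batch_t = np.random.rand(64, n_features).astype(np.float32)
    batch_tpo = np.random.rand(64, n_features).astype(np.float32)
    ae.sess.run([ae.train_0, ae.train_1],
                feed_dict={ae.encoder_input_t: batch_t,
                           ae.encoder_input_tpo: batch_tpo})
    if step % 100 == 0:
        ae.sess.run(ae.sync_encoder + ae.sync_decoder)
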
Example #3
    def __init__(
            self,
            learning_rate,
            memory_size,
            batch_size,
            sess
    ):
        self.sess = sess
        self.common_encoder_input = tf.placeholder(tf.float32, shape=[None, n_features], name='common_encoder_input')
        self.common_encoder_output = mlp(inputs=self.common_encoder_input, n_output=n_features, scope='common_encoder_output',
                                         hiddens=[16, 8])
        self.common_decoder_output = mlp(inputs=self.common_encoder_output, n_output=n_features, scope='common_decoder_output')

        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size

        self.memory = Memory(self.memory_size)

        self.loss = tf.reduce_mean(tf.squared_difference(self.common_encoder_input, self.common_decoder_output))
        self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
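
Example #3 is the same reconstruction objective as Example #1, with a single shared encoder/decoder pair. A minimal driver sketch follows; the class name CommonAutoEncoder and the value of n_features are placeholders for illustration and do not appear in the snippet itself.

import numpy as np
import tensorflow as tf

n_features = 4                      # assumed module-level constant
sess = tf.Session()
ae = CommonAutoEncoder(learning_rate=1e-3, memory_size=10000, batch_size=64, sess=sess)  # hypothetical class name
sess.run(tf.global_variables_initializer())
for _ in range(1000):
    states = np.random.rand(64, n_features).astype(np.float32)  # stand-in for real states
    _, loss = sess.run([ae.train, ae.loss], feed_dict={ae.common_encoder_input: states})
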
Example #4
                intra_op_parallelism_threads=1,
            ))
    # set saver
    if SAVE or LOAD:  # a Saver is needed both for saving and for restoring below
        saver = tf.train.Saver()
    if LOAD:
        model_file = tf.train.latest_checkpoint(LOAD_FILE_PATH)
        saver.restore(sess, model_file)

    if RESULT_EXPORT:
        f = open('/~/result.txt', 'w')

    #try to share some common layers
    common_eval_input = tf.placeholder(tf.float32, shape=[None, n_features], name='common_eval_input')
    common_target_input = tf.placeholder(tf.float32, shape=[None, n_features], name='common_target_input')
    common_eval_output = mlp(inputs=common_eval_input, n_output=64, scope='common_eval_layer', hiddens=hiddens)
    common_target_output = tf.stop_gradient(mlp(inputs=common_target_input, n_output=64, scope='common_target_layer', hiddens=hiddens))

    #initialize the plot
    # fig = plt.figure()
    # ax = fig.add_subplot(1,1,1)
    # ax.axis("equal")
    # plt.ion()
    # plt.ylim((0,10))
    # x= [0]
    # y= [0]

    #add agents
    ais = []
    for i in range(ai_number):
        ais.append(DQN(
Example #5
    def __init__(self,
                 n_features,
                 n_actions,
                 model,
                 scope,
                 sess,
                 order,
                 hiddens,
                 beta,
                 C,
                 common_eval_input,
                 common_target_input,
                 common_eval_output,
                 common_target_output,
                 learning_rate=1e-5,
                 decay=0.99,
                 memory_size=20000000,
                 batch_size=100000,
                 epsilon_decrement=0.0005,
                 epsilon_lower=0.2):
        self.sess = sess
        self.scope = scope
        self.n_features = n_features
        self.batch_size = batch_size
        self.decay = decay
        self.model = model
        self.memory = Memory(memory_size)
        self.order = order
        self.beta = beta
        self.C = C

        self.learn_times = 0

        self.epsilon_lower = epsilon_lower
        self.epsilon_decrement = epsilon_decrement

        self.eval_input = tf.placeholder(tf.float32,
                                         shape=[None, self.n_features],
                                         name='eval_input')
        self.target_input = tf.placeholder(tf.float32,
                                           shape=[None, self.n_features],
                                           name='target_input')
        self.actions_selected = tf.placeholder(tf.int32, shape=[None],
                                               name='actions_selected')
        self.done = tf.placeholder(tf.float32, shape=[None], name='done')
        self.decays = tf.placeholder(tf.float32, shape=[None], name='decay')
        self.rewards = tf.placeholder(tf.float32, shape=[None], name='rewards')

        #about the encoder
        self.state_input_t = tf.placeholder(tf.float32,
                                            shape=[None, self.n_features],
                                            name='state_input_t')
        self.state_input_tpo = tf.placeholder(tf.float32,
                                              shape=[None, self.n_features],
                                              name='state_input_tpo')
        self.action_plus_state_input = tf.placeholder(
            tf.float32,
            shape=[None, self.n_features + 1],
            name='action_plus_state_input')

        #share the first layers
        self.common_eval_input = common_eval_input
        self.common_target_input = common_target_input
        self.common_eval_output = common_eval_output
        self.common_target_output = common_target_output

        with tf.variable_scope(self.scope):
            self._epsilon = tf.get_variable(name='epsilon',
                                            dtype=tf.float32,
                                            initializer=1.0)
            self._epsilon_decrement = tf.constant(epsilon_decrement)
            self.update_epsilon = tf.assign(
                self._epsilon, self._epsilon - self._epsilon_decrement)
            self.reset_epsilon = tf.assign(self._epsilon, 1)

            # self.eval_output = model(inputs=self.eval_input, n_output=n_actions, scope='eval_net', hiddens=hiddens)
            # self.target_output = tf.stop_gradient(
            #     model(inputs=self.target_input, n_output=n_actions, scope='target_net', hiddens=hiddens))

            self.eval_output = model(inputs=self.common_eval_output,
                                     n_output=n_actions,
                                     scope='eval_net',
                                     hiddens=hiddens)
            self.target_output = tf.stop_gradient(
                model(inputs=self.common_target_output,
                      n_output=n_actions,
                      scope='target_net',
                      hiddens=hiddens))

            #about encoder
            self.encoder_temp_t = mlp(inputs=self.state_input_t,
                                      n_output=64,
                                      scope='encoder_temp_t',
                                      hiddens=[32, 64])
            self.encoder_temp_tpo = tf.stop_gradient(
                mlp(inputs=self.state_input_tpo,
                    n_output=64,
                    scope='encoder_temp_tpo',
                    hiddens=[32, 64]))

            self.encoder_output_t = mlp(inputs=self.encoder_temp_t,
                                        n_output=self.n_features,
                                        scope='encoder_t',
                                        hiddens=[64, 32])
            self.encoder_output_tpo = mlp(inputs=self.encoder_temp_tpo,
                                          n_output=self.n_features,
                                          scope='encoder_tpo',
                                          hiddens=[64, 32])
            self.predict_output = mlp(inputs=self.action_plus_state_input,
                                      n_output=64,
                                      scope='predict_output',
                                      hiddens=[64, 32])

            self.predict_mse = tf.reduce_sum(
                tf.square(self.encoder_temp_tpo -
                          self.predict_output)) * self.n_features
            self.emax = tf.get_variable(name='emax',
                                        dtype=tf.float32,
                                        initializer=1.0)
            self.update_emax = tf.assign(
                self.emax, tf.maximum(self.emax, self.predict_mse))
            self.e_normalize = tf.div(self.predict_mse, self.emax)

            self.encoder_loss = tf.reduce_sum(
                tf.square(self.state_input_t - self.encoder_output_t))
            self.train_encoder = tf.train.AdamOptimizer(
                learning_rate).minimize(self.encoder_loss)
            self.M_loss = self.predict_mse
            self.train_M = tf.train.AdamOptimizer(learning_rate).minimize(
                self.M_loss)

        self.eval_output_selected = tf.reduce_sum(
            self.eval_output * tf.one_hot(self.actions_selected, n_actions),
            axis=1)
        self.eval_output_target = self.rewards + self.decays * tf.reduce_max(
            self.target_output, axis=1) * (1. - self.done)

        self.loss = tf.reduce_mean(
            tf.squared_difference(self.eval_output_selected,
                                  self.eval_output_target))
        self.train = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

        self.eval_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope=scope + '/eval_net')
        self.target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope=scope + '/target_net')

        self.update = [
            tf.assign(x, y)
            for x, y in zip(self.target_params, self.eval_params)
        ]

        self.sess.run(tf.global_variables_initializer())
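
The update list built near the end of the constructor is the usual hard target-network copy, but nothing is synchronized until it is run in the session. A minimal sketch of how a learn step might trigger it, using only attributes that already exist on the class (agent is a hypothetical instance):

# Assumed hook inside a learn step: every C updates, copy eval_net into target_net.
if agent.learn_times % agent.C == 0:
    agent.sess.run(agent.update)
agent.learn_times += 1
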
def main():

    model_choices = ["atari_deepmind",
                     "cnn_to_lstm",
                     "mlp",
                     "lstm_to_mlp",
                     "cnn_to_lstm_new"]
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--env", help="environment ID", default="BreakoutNoFrameskip-v4")
    parser.add_argument("--seed", help="RNG seed", type=int, default=0)
    parser.add_argument("--prioritized", type=int, default=1)
    parser.add_argument("--dueling", type=int, default=1)
    parser.add_argument("--num-timesteps", type=int, default=int(5*10e2))
    parser.add_argument("--learning-rate", type=float, default=1e-4)
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--buffer-size", type=int, default=int(1e6))
    parser.add_argument("--exploration_steps", type=float, default=1e6)
    parser.add_argument("--exploration_final_eps", type=float, default=0.1)
    parser.add_argument("--train-freq", type=int, default=4)
    parser.add_argument("--learning-starts", type=int, default=int(1e4))
    parser.add_argument("--target_network_update_freq", type=int, default=int(1e3))
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--model", type=str, choices = model_choices, default="cnn_to_lstm_new")
    args = parser.parse_args()

    logger.configure(log_dir)

    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)

    if args.model == "mlp":
        model = mlp(hiddens=[256, 256])
    elif args.model == "atari_deepmind":
        model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                           hiddens=[256],
                           duelings=bool(args.dueling))
    elif args.model == "cnn_to_lstm":
        model = cnn_to_lstm(convs=[(16, 8, 4), (32, 4, 2)],
                            lstm_hidden_size=512,
                            lstm_out_size=256,
                            hiddens=[256, 128],
                            batch_size=int(args.batch_size),
                            duelings=bool(args.dueling))
    elif args.model == "cnn_to_lstm_new":
        model = cnn_to_lstm_new(convs=[(16, 8, 4), (32, 4, 2)],
                                lstm_hidden_size=512,
                                lstm_out_size=256,
                                hiddens=[256, 128],
                                batch_size=int(args.batch_size),
                                duelings=bool(args.dueling))
    elif args.model == "lstm_to_mlp":
        model = lstm_to_mlp(lstm_hidden_size=512,
                            lstm_out_size=256,
                            hiddens=[256, 128],
                            batch_size=int(args.batch_size),
                            duelings=bool(args.dueling))

    act = deepq.learn(
        env,
        q_func=model,
        lr=args.learning_rate,
        max_timesteps=args.num_timesteps,
        buffer_size=int(args.buffer_size),
        exploration_fraction=(args.exploration_steps / args.num_timesteps),
        exploration_final_eps=args.exploration_final_eps,
        train_freq=args.train_freq,
        batch_size=int(args.batch_size),
        learning_starts=int(args.learning_starts),
        target_network_update_freq=int(args.target_network_update_freq),
        gamma=args.gamma,
        prioritized_replay=bool(args.prioritized)
    )

    f = open(os.path.join(log_dir, "README.me"), "w")
    f.write("\tenv \t{}\n".format(args.env))
    f.write("\tmodel\t{}\n".format(args.model))
    f.write("\tprioritized\t{}\n".format(args.prioritized))
    f.write("\tdueling\t{}\n".format(args.dueling))
    f.write(("\tlearning rate\t{}\n".format(args.learning_rate)))
    f.write(("\tbatch size\t{}\n").format(args.batch_size))
    f.write("\tmax timestep\t{}\n".format(args.num_timesteps))
    f.write("\tbuffer size\t{}\n".format(args.buffer_size))
    f.write("\texploration fraction\t{}\n".format(args.exploration_steps/args.num_timesteps))
    f.write("\texploration_final_eps\t{}\n".format(args.exploration_final_eps))
    f.write("\ttrain freq\t{}\n".format(args.train_freq))
    f.write("\tlearning starts\t{}\n".format(args.learning_rate))
    f.write("\ttarget network update freq\t{}\n".format(args.target_network_update_freq))
    f.close()
    act.save("log/{}.pkl".format(args.model + "_" + args.env.replace(" ", "_")))
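
For reference, main() is driven entirely by the flags its argparse block defines, so a typical launch would look like python <script>.py --env BreakoutNoFrameskip-v4 --model cnn_to_lstm_new --batch-size 1 (the script's file name is not shown in this snippet and the values are only illustrative). The run configuration is then written to README.me in the log directory and the trained act object is pickled under log/.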