Example #1
    def __init__(self):
        self.device = '/gpu:0' if USE_GPU else '/cpu:0'
        self.stop_requested = False
        self.global_t = 0
        # shared global network; -1 is passed as its thread index
        if USE_LSTM:
            self.global_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN,
                                                 ACTION_DIM, self.device, -1)
        else:
            self.global_network = A3CFFNetwork(STATE_DIM, STATE_CHN,
                                               ACTION_DIM, self.device)

        # pick the initial learning rate between INITIAL_ALPHA_LOW and
        # INITIAL_ALPHA_HIGH on a log scale
        self.initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW,
                                                 INITIAL_ALPHA_HIGH,
                                                 INITIAL_ALPHA_LOG_RATE)
        self.learning_rate_input = tf.placeholder('float')
        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.learning_rate_input,
            decay=RMSP_ALPHA,
            momentum=0.0,
            epsilon=RMSP_EPSILON)

        self.sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=False, allow_soft_placement=True))

        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)

        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)

        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(LOG_FILE, self.sess.graph)

        self.actor_threads = []
        for i in range(PARALLEL_SIZE):
            actor_thread = A3CActorThread(i, self.global_network,
                                          self.initial_learning_rate,
                                          self.learning_rate_input,
                                          self.optimizer, MAX_TIME_STEP,
                                          self.device)
            actor_thread.set_log_parmas(self.summary_writer, self.summary_op,
                                        self.reward_input, self.time_input)
            self.actor_threads.append(actor_thread)

        self.sess.run(tf.initialize_all_variables())
        self.saver = tf.train.Saver()
        self.restore()
        return
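
The log_uniform helper used above is not part of the excerpt. A minimal sketch, assuming it simply interpolates between the two bounds in log space (the usual way A3C implementations pick a per-run learning rate); this function body is an assumption, not the repository's code:

import math

def log_uniform(lo, hi, rate):
    # interpolate between lo and hi in log space; rate in [0, 1]
    # selects the point (0 gives lo, 1 gives hi)
    log_lo = math.log(lo)
    log_hi = math.log(hi)
    return math.exp(log_lo * (1.0 - rate) + log_hi * rate)
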
Example #2
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, optimizer, max_global_time_step, device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if USE_LSTM:
            self.local_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN,
                                                ACTION_DIM, device,
                                                thread_index)
        else:
            self.local_network = A3CFFNetwork(STATE_DIM, STATE_CHN, ACTION_DIM,
                                              device)
        self.local_network.create_loss(ENTROPY_BETA)
        # AccumTrainer accumulates gradients of the local loss across steps,
        # to be applied to the global network in one batch
        self.trainer = AccumTrainer(device)
        self.trainer.create_minimize(self.local_network.total_loss,
                                     self.local_network.get_vars())
        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        # clip the accumulated gradients by norm and apply them to the
        # global network's variables
        clip_accum_grads = [
            tf.clip_by_norm(accum_grad, 40.0)
            for accum_grad in self.trainer.get_accum_grad_list()
        ]
        self.apply_gradients = optimizer.apply_gradients(
            zip(clip_accum_grads, global_network.get_vars()))

        # op that copies the global network's weights into this local network
        self.sync = self.local_network.sync_from(global_network)

        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate

        # for logging
        self.episode_reward = 0.0
        self.episode_start_time = 0.0
        self.prev_local_t = 0

        # for pull mode, e.g. a browser-based game
        self.states = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.start_lstm_state = None
        return
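
The constructor above only builds the ops; a typical A3C update runs them in a fixed order each training step: sync from the global network, reset the accumulators, accumulate gradients over a short rollout, then apply the clipped result to the global network. A rough sketch of such a step; the rollout_feeds argument (one loss feed dict per collected environment step) and the linear learning-rate decay are assumptions for illustration, not code from the repository:

    def train_step(self, sess, global_t, rollout_feeds):
        # copy the latest global weights into the local network
        sess.run(self.sync)
        # clear the gradient accumulators from the previous update
        sess.run(self.reset_gradients)
        # accumulate gradients of the local loss over the rollout
        for feed in rollout_feeds:
            sess.run(self.accum_gradients, feed_dict=feed)
        # anneal the learning rate linearly towards zero over the run
        lr = self.initial_learning_rate * max(
            0.0, (self.max_global_time_step - global_t)
            / float(self.max_global_time_step))
        # apply the clipped, accumulated gradients to the global network
        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: lr})
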
Example #3
    def __init__(self):
        self.device = '/gpu:0' if USE_GPU else '/cpu:0'
        self.stop_requested = False
        self.global_t = 0
        if USE_LSTM:
            self.global_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, self.device, -1)
        else:
            self.global_network = A3CFFNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, self.device)
        self.global_network.create_loss(ENTROPY_BETA)

        self.initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH, INITIAL_ALPHA_LOG_RATE)
        print 'initial_learning_rate:', self.initial_learning_rate
        self.learning_rate_input = tf.placeholder('float')
        self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_input,
                                                   decay=RMSP_ALPHA, momentum=0.0, epsilon=RMSP_EPSILON)

        # gradients of the shared loss are computed and applied directly
        # on the global network's variables
        grads_and_vars = self.optimizer.compute_gradients(
            self.global_network.total_loss, self.global_network.get_vars())
        self.apply_gradients = self.optimizer.apply_gradients(grads_and_vars)

        self.actor_threads = []
        for i in range(PARALLEL_SIZE):
            actor_thread = A3CActorThread(i, self.global_network)
            self.actor_threads.append(actor_thread)

        self.sess = tf.InteractiveSession()
        self.sess.run(tf.initialize_all_variables())

        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)

        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)

        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(LOG_FILE, self.sess.graph)

        self.saver = tf.train.Saver()
        self.restore()

        # lock and Redis queue used by the pull-mode training path
        self.lock = threading.Lock()
        self.rq = RedisQueue(REDIS_QUEUE_NAME)
        self.train_count = 0
        return
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, optimizer, max_global_time_step, device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if USE_LSTM:
            self.local_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN,
                                                ACTION_DIM, device,
                                                thread_index)
        else:
            self.local_network = A3CFFNetwork(STATE_DIM, STATE_CHN, ACTION_DIM,
                                              device, thread_index)
        self.local_network.create_loss(ENTROPY_BETA)
        # unlike the AccumTrainer variant above, gradients are taken directly
        # with tf.gradients, clipped by norm, and applied to the global network
        self.gradients = tf.gradients(self.local_network.total_loss,
                                      self.local_network.get_vars())

        clip_accum_grads = [
            tf.clip_by_norm(accum_grad, 10.0) for accum_grad in self.gradients
        ]
        self.apply_gradients = optimizer.apply_gradients(
            zip(clip_accum_grads, global_network.get_vars()))
        # self.apply_gradients = optimizer.apply_gradients(zip(self.gradients, global_network.get_vars()))

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(thread_index)

        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate

        # for logging
        self.episode_reward = 0.0
        self.episode_start_time = 0.0
        self.prev_local_t = 0
        return
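
Both master constructors (Examples #1 and #3) set stop_requested and collect actor_threads, but the launch and shutdown path is outside these excerpts. A rough sketch of how the threads might be started and stopped; the a.run(master.sess) entry point and the SIGINT handler are assumptions, not the repository's actual interface:

import signal
import threading

def run_training(master):
    # master is an instance of the A3C master class built above;
    # a.run is a hypothetical per-thread training loop
    def on_sigint(signum, frame):
        # flag polled by the worker loops so Ctrl-C shuts everything down
        master.stop_requested = True

    signal.signal(signal.SIGINT, on_sigint)
    threads = [threading.Thread(target=a.run, args=(master.sess,))
               for a in master.actor_threads]
    for th in threads:
        th.start()
    for th in threads:
        th.join()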