def __init__(self):
    self.device = '/gpu:0' if USE_GPU else '/cpu:0'
    self.stop_requested = False
    self.global_t = 0

    if USE_LSTM:
        self.global_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, self.device, -1)
    else:
        self.global_network = A3CFFNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, self.device)

    self.initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH,
                                             INITIAL_ALPHA_LOG_RATE)
    self.learning_rate_input = tf.placeholder('float')
    self.optimizer = tf.train.RMSPropOptimizer(
        learning_rate=self.learning_rate_input,
        decay=RMSP_ALPHA,
        momentum=0.0,
        epsilon=RMSP_EPSILON)

    self.sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True))

    # summary ops for TensorBoard logging
    self.reward_input = tf.placeholder(tf.float32)
    tf.scalar_summary('reward', self.reward_input)
    self.time_input = tf.placeholder(tf.float32)
    tf.scalar_summary('living_time', self.time_input)
    self.summary_op = tf.merge_all_summaries()
    self.summary_writer = tf.train.SummaryWriter(LOG_FILE, self.sess.graph)

    # one actor thread per parallel worker, all sharing the global network
    self.actor_threads = []
    for i in range(PARALLEL_SIZE):
        actor_thread = A3CActorThread(i, self.global_network, self.initial_learning_rate,
                                      self.learning_rate_input, self.optimizer,
                                      MAX_TIME_STEP, self.device)
        actor_thread.set_log_parmas(self.summary_writer, self.summary_op,
                                    self.reward_input, self.time_input)
        self.actor_threads.append(actor_thread)

    self.sess.run(tf.initialize_all_variables())
    self.saver = tf.train.Saver()
    self.restore()
    return
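The constructor only builds the actor threads; how they are started is not shown here. As a rough sketch (the per-thread entry point `actor_loop` is a hypothetical name, not a method defined in this listing), the master could launch one Python thread per actor and stop them on Ctrl-C roughly like this:

import signal
import threading

def run(self):
    # hypothetical driver: one OS thread per A3CActorThread
    def train_function(index):
        self.actor_threads[index].actor_loop(self.sess)   # assumed per-thread loop

    threads = [threading.Thread(target=train_function, args=(i,))
               for i in range(PARALLEL_SIZE)]

    def signal_handler(signum, frame):
        self.stop_requested = True                        # actor loops would poll this flag
    signal.signal(signal.SIGINT, signal_handler)

    for t in threads:
        t.start()
    signal.pause()                                        # wait for Ctrl-C (Unix)
    for t in threads:
        t.join()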
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, optimizer, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, device, thread_index)
    else:
        self.local_network = A3CFFNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, device)
    self.local_network.create_loss(ENTROPY_BETA)

    # accumulate gradients of the local loss, then apply the clipped
    # accumulators to the shared global network
    self.trainer = AccumTrainer(device)
    self.trainer.create_minimize(self.local_network.total_loss, self.local_network.get_vars())
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    clip_accum_grads = [tf.clip_by_norm(accum_grad, 40.0)
                        for accum_grad in self.trainer.get_accum_grad_list()]
    self.apply_gradients = optimizer.apply_gradients(
        zip(clip_accum_grads, global_network.get_vars()))
    self.sync = self.local_network.sync_from(global_network)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate

    # for logging
    self.episode_reward = 0.0
    self.episode_start_time = 0.0
    self.prev_local_t = 0

    # rollout buffers for pull mode, e.g. a browser-based game
    self.states = []
    self.actions = []
    self.rewards = []
    self.values = []
    self.start_lstm_state = None
    return
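With the AccumTrainer ops above, one local update cycle would reset the accumulators, sync the local network from the global one, accumulate gradients over a short rollout, and finally apply the clipped accumulators to the global variables. The following is only a sketch of that cycle on the same class, not the repository's actual method: the rollout itself is elided, and the linear learning-rate annealing is an assumption suggested by the `initial_learning_rate` and `max_global_time_step` fields.

def process_sketch(self, sess, global_t):
    # clear previously accumulated gradients and pull the latest global weights
    sess.run(self.reset_gradients)
    sess.run(self.sync)

    # ... run LOCAL_T_MAX environment steps here, filling self.states/actions/
    # rewards/values and calling sess.run(self.accum_gradients, feed_dict=...) ...

    # assumed linear annealing of the learning rate toward zero
    learning_rate = self.initial_learning_rate * \
        (self.max_global_time_step - global_t) / self.max_global_time_step
    sess.run(self.apply_gradients,
             feed_dict={self.learning_rate_input: max(learning_rate, 0.0)})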
def __init__(self):
    self.device = '/gpu:0' if USE_GPU else '/cpu:0'
    self.stop_requested = False
    self.global_t = 0

    if USE_LSTM:
        self.global_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, self.device, -1)
    else:
        self.global_network = A3CFFNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, self.device)
    self.global_network.create_loss(ENTROPY_BETA)

    self.initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH,
                                             INITIAL_ALPHA_LOG_RATE)
    print 'initial_learning_rate:', self.initial_learning_rate
    self.learning_rate_input = tf.placeholder('float')
    self.optimizer = tf.train.RMSPropOptimizer(
        learning_rate=self.learning_rate_input,
        decay=RMSP_ALPHA,
        momentum=0.0,
        epsilon=RMSP_EPSILON)

    # in pull mode the master computes and applies gradients on the global network itself
    grads_and_vars = self.optimizer.compute_gradients(
        self.global_network.total_loss, self.global_network.get_vars())
    self.apply_gradients = self.optimizer.apply_gradients(grads_and_vars)

    self.actor_threads = []
    for i in range(PARALLEL_SIZE):
        actor_thread = A3CActorThread(i, self.global_network)
        self.actor_threads.append(actor_thread)

    self.sess = tf.InteractiveSession()
    self.sess.run(tf.initialize_all_variables())

    # summary ops for TensorBoard logging
    self.reward_input = tf.placeholder(tf.float32)
    tf.scalar_summary('reward', self.reward_input)
    self.time_input = tf.placeholder(tf.float32)
    tf.scalar_summary('living_time', self.time_input)
    self.summary_op = tf.merge_all_summaries()
    self.summary_writer = tf.train.SummaryWriter(LOG_FILE, self.sess.graph)

    self.saver = tf.train.Saver()
    self.restore()

    self.lock = threading.Lock()
    self.rq = RedisQueue(REDIS_QUEUE_NAME)
    self.train_count = 0
    return
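The pull-mode master exchanges data with browser clients through `RedisQueue`, which is not shown in this listing. A minimal FIFO queue of this kind is commonly built on redis-py roughly as follows; this is a sketch under that assumption, not the project's actual helper, and the `put`/`get` method names are assumptions:

import redis

class RedisQueue(object):
    """Minimal FIFO queue on top of a Redis list (sketch)."""

    def __init__(self, name, namespace='queue', **redis_kwargs):
        self.db = redis.StrictRedis(**redis_kwargs)
        self.key = '%s:%s' % (namespace, name)

    def put(self, item):
        # append to the tail of the list
        self.db.rpush(self.key, item)

    def get(self, block=True, timeout=None):
        # pop from the head; blpop returns a (key, value) pair or None on timeout
        if block:
            item = self.db.blpop(self.key, timeout=timeout)
            if item:
                item = item[1]
        else:
            item = self.db.lpop(self.key)
        return item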
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, optimizer, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, device, thread_index)
    else:
        self.local_network = A3CFFNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, device, thread_index)
    self.local_network.create_loss(ENTROPY_BETA)

    # compute gradients of the local loss directly and apply the clipped
    # gradients to the global network's variables
    self.gradients = tf.gradients(self.local_network.total_loss, self.local_network.get_vars())
    clip_accum_grads = [tf.clip_by_norm(accum_grad, 10.0) for accum_grad in self.gradients]
    self.apply_gradients = optimizer.apply_gradients(
        zip(clip_accum_grads, global_network.get_vars()))
    # self.apply_gradients = optimizer.apply_gradients(zip(self.gradients, global_network.get_vars()))

    self.sync = self.local_network.sync_from(global_network)
    self.game_state = GameState(thread_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate

    # for logging
    self.episode_reward = 0.0
    self.episode_start_time = 0.0
    self.prev_local_t = 0
    return
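This variant clips each gradient tensor independently with `tf.clip_by_norm(..., 10.0)`. If joint scaling of all gradients is preferred, global-norm clipping is a common alternative; with the same TF 0.x-era API, the construction would look like this (a sketch of the alternative, not what the code above does):

# Rescale all gradients together so their combined norm stays below 10.0;
# tf.clip_by_global_norm returns (clipped_list, global_norm).
clipped_grads, _ = tf.clip_by_global_norm(self.gradients, 10.0)
self.apply_gradients = optimizer.apply_gradients(
    zip(clipped_grads, global_network.get_vars()))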