# Assumed imports for the snippets in this file (not present in the original;
# later classes additionally use names such as Adam, Input, Model, Concatenate,
# and TimeDistributed from the same tf.keras packages):
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras.layers import (BatchNormalization, Conv2D, Dense,
                                     Dropout, Flatten, LeakyReLU, Reshape,
                                     UpSampling2D)
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import RMSprop


class Generator(tf.keras.Model):
    def __init__(self, random_noise_size=100):
        super().__init__(name='generator')
        # Layers
        init = RandomNormal(stddev=0.2)
        self.dense_1 = Dense(7 * 7 * 256,
                             use_bias=False,
                             input_shape=(random_noise_size, ))
        self.batchNorm1 = BatchNormalization()
        self.leaky_1 = LeakyReLU(alpha=0.2)
        self.reshape_1 = Reshape((7, 7, 256))
        self.up_2 = UpSampling2D((1, 1), interpolation='nearest')
        self.conv2 = Conv2D(128, (3, 3),
                            strides=(1, 1),
                            padding="same",
                            use_bias=False,
                            kernel_initializer=init)
        self.batchNorm2 = BatchNormalization()
        self.leaky_2 = LeakyReLU(alpha=0.2)
        self.up_3 = UpSampling2D((2, 2), interpolation='nearest')
        self.conv3 = Conv2D(64, (3, 3),
                            strides=(1, 1),
                            padding="same",
                            use_bias=False,
                            kernel_initializer=init)
        self.batchNorm3 = BatchNormalization()
        self.leaky_3 = LeakyReLU(alpha=0.2)
        self.up_4 = UpSampling2D((2, 2), interpolation='nearest')
        self.conv4 = Conv2D(1, (3, 3),
                            activation='tanh',
                            strides=(1, 1),
                            padding="same",
                            use_bias=False,
                            kernel_initializer=init)
        self.optimizer = RMSprop(learning_rate=0.00005)

    def call(self, input_tensor):
        # Definition of forward pass: 7x7x256 -> 7x7x128 -> 14x14x64 -> 28x28x1
        x = self.reshape_1(
            self.leaky_1(self.batchNorm1(self.dense_1(input_tensor))))
        x = self.leaky_2(self.batchNorm2(self.conv2(self.up_2(x))))
        x = self.leaky_3(self.batchNorm3(self.conv3(self.up_3(x))))
        return self.conv4(self.up_4(x))

    def generate_noise(self, batch_size, random_noise_size):
        return tf.random.normal([batch_size, random_noise_size])

    def compute_loss(self, y_true, y_pred, class_wanted, class_y):
        """Wasserstein loss minus the classifier cross-entropy (how well the
        classifier gets it right)."""
        return backend.mean(y_true * y_pred) - categorical_crossentropy(
            class_wanted, class_y)

    def backPropagate(self, gradients, trainable_variables):
        self.optimizer.apply_gradients(zip(gradients, trainable_variables))
class Critic(tf.keras.Model):
    def __init__(self):
        super().__init__(name="critic")
        init = RandomNormal(stddev=0.2)
        # Layers
        self.conv_1 = Conv2D(64, (4, 4),
                             strides=(2, 2),
                             padding='same',
                             kernel_initializer=init,
                             input_shape=[28, 28, 1])
        self.leaky_1 = LeakyReLU(alpha=0.2)
        self.dropout_1 = Dropout(0.3)
        self.conv_2 = Conv2D(128, (4, 4),
                             strides=(2, 2),
                             padding='same',
                             kernel_initializer=init)
        self.leaky_2 = LeakyReLU(alpha=0.2)
        self.dropout_2 = Dropout(0.3)
        self.flat = Flatten()
        # This neuron tells us if the input is fake or real
        self.logits = Dense(1)
        self.optimizer = RMSprop(learning_rate=0.00005)

    def call(self, input_tensor):
        # Definition of forward pass
        x = self.dropout_1(self.leaky_1(self.conv_1(input_tensor)))
        x = self.dropout_2(self.leaky_2(self.conv_2(x)))
        x = self.flat(x)
        return self.logits(x)

    def compute_loss(self, y_true, y_pred, grad_p):
        """Wasserstein loss with gradient penalty."""
        lambda_ = 10.0
        return backend.mean(y_true * y_pred) + (lambda_ * grad_p)

    def backPropagate(self, gradients, trainable_variables):
        self.optimizer.apply_gradients(zip(gradients, trainable_variables))
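# A minimal WGAN-GP training-step sketch showing how the Generator and Critic
# above might be wired together on a batch of 28x28x1 images.
# gradient_penalty() and critic_train_step() are hypothetical helpers written
# for illustration, not part of the original snippet.
def gradient_penalty(critic, real_images, fake_images):
    batch_size = tf.shape(real_images)[0]
    eps = tf.random.uniform([batch_size, 1, 1, 1], 0.0, 1.0)
    interpolated = eps * real_images + (1.0 - eps) * fake_images
    with tf.GradientTape() as tape:
        tape.watch(interpolated)
        pred = critic(interpolated)
    grads = tape.gradient(pred, interpolated)
    norm = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=[1, 2, 3]) + 1e-12)
    return tf.reduce_mean((norm - 1.0) ** 2)


def critic_train_step(generator, critic, real_images, noise_size=100):
    batch_size = tf.shape(real_images)[0]
    noise = generator.generate_noise(batch_size, noise_size)
    with tf.GradientTape() as tape:
        fake_images = generator(noise)
        real_pred = critic(real_images)
        fake_pred = critic(fake_images)
        # Wasserstein critic loss: push real scores up, fake scores down,
        # with the usual lambda = 10 penalty weight
        loss = (backend.mean(fake_pred) - backend.mean(real_pred) +
                10.0 * gradient_penalty(critic, real_images, fake_images))
    grads = tape.gradient(loss, critic.trainable_variables)
    critic.backPropagate(grads, critic.trainable_variables)
    return loss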
class Network:
    def __init__(self, input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.optimizer = RMSprop(0.01)
        # self.optimizer = SGD(0.001, momentum=0.9)
        self.model = self._build_model()

    def loss(self, inputs, targets):
        y = self.model(inputs)
        cross_entropy_loss = -tf.reduce_sum(targets * tf.math.log(y + 1e-6),
                                            axis=1)
        loss = tf.reduce_mean(cross_entropy_loss)
        return loss

    def accuracy(self, inputs, targets):
        y = self.model(inputs)
        acc = tf.cast(
            tf.equal(tf.argmax(targets, axis=1), tf.argmax(y, axis=1)),
            tf.float32)
        acc = tf.reduce_sum(acc)
        p = acc / y.shape[0]
        return p

    def train(self, inputs, targets):
        with tf.GradientTape() as tape:
            loss = self.loss(inputs, targets)
        grads = tape.gradient(loss, self.model.trainable_variables)
        grads_and_vars = zip(grads, self.model.trainable_variables)
        self.optimizer.apply_gradients(grads_and_vars)
        return loss

    def _build_model(self):
        input_x = Input(shape=(self.input_shape, ))
        x = Dense(64, activation=relu)(input_x)
        x = Dense(64, activation=relu)(x)
        output = Dense(self.output_shape, activation=softmax)(x)
        model = Model(inputs=input_x, outputs=[output])
        return model
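# A short usage sketch for Network, assuming one-hot targets; the data here
# is random and purely illustrative.
net = Network(input_shape=4, output_shape=3)
X = np.random.rand(32, 4).astype(np.float32)
Y = np.eye(3, dtype=np.float32)[np.random.randint(0, 3, size=32)]
for step in range(100):
    loss = net.train(X, Y)
print(float(net.accuracy(X, Y)))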
class MainAgent:
    def __init__(self, name='agent', reward_weights=None):
        self.reward = 0
        self.episode = 0
        self.name = name
        # Default reward weights
        self.reward_weights = {
            'enemy_killed_value': 1,
            'friendly_killed_value': 1,
            'killed_value': 1,
            'damage_taken': 1,
            'damage_given': 1,
            'damage': 1,
            'outcome': 1,
        }
        if reward_weights:
            self.reward_weights.update(reward_weights)
        self.last_obs = None
        self.recorder = []
        self.model = self.build_model(SCREEN_SIZE, SCREEN_SIZE, SCREEN_DEPTH,
                                      UNIT_TENSOR_LENGTH, len(ACTION_OPTIONS))
        self.opt = RMSprop(learning_rate=LEARNING_RATE)

        # How to convert Blizzard unit and building IDs to our subset of units
        def convert_unit_ids(x):
            if x in UNIT_OPTIONS:
                return (UNIT_OPTIONS.index(x) + 1.) / len(UNIT_OPTIONS)
            return 0.

        self.convert_unit_ids = convert_unit_ids
        self.convert_unit_ids_vect = np.vectorize(convert_unit_ids)

        # How to convert 'player_relative' data
        def convert_player_ids(x):
            if x == 1:  # Self
                return 1.
            elif x == 4:  # Enemy
                return -1.
            else:  # Background usually
                return 0.

        self.convert_player_ids = convert_player_ids
        self.convert_player_ids_vect = np.vectorize(convert_player_ids)

    def reset(self):
        self.recorder = [Episode()]
        self.episode = 0

    def next_episode(self):
        self.episode += 1
        self.recorder.append(Episode())
        self.last_obs = None

    # Train model with recorded game data
    def train(self):
        loss = np.array([0., 0., 0., 0.])
        for ep in self.recorder:
            loss += self._train(
                ep.screen_input[:ep.current_step],
                ep.action_input[:ep.current_step],
                ep.unit_input[:ep.current_step],
                get_discounted_rewards(ep.rewards[:ep.current_step],
                                       discount_rate=DISCOUNT_RATE),
                ep.nonspatial_action[:ep.current_step],
                ep.spatial_action[:ep.current_step],
                ep.screen_used[:ep.current_step])
        return loss / len(self.recorder)

    def _train(self, screens_input, action_input, select_input, reward,
               action, screen_action, screen_used):
        _entropy = _policy_loss = _value_loss = 0.
        with tf.GradientTape() as tape:
            spatial_policy, ns_policy, value = self.model(
                [screens_input, action_input, select_input])
            value = K.squeeze(value, axis=1)
            ns_action_one_hot = K.one_hot(action, len(ACTION_OPTIONS))
            screen_action_one_hot = K.one_hot(screen_action,
                                              SCREEN_SIZE * SCREEN_SIZE)
            value_loss = .5 * K.square(reward - value)
            entropy = -K.sum(ns_policy * K.log(ns_policy + 1e-10), axis=1) - \
                K.sum(spatial_policy * K.log(spatial_policy + 1e-10), axis=1)
            ns_log_prob = K.log(
                K.sum(ns_policy * ns_action_one_hot, axis=1) + 1e-10)
            spatial_log_prob = K.log(
                K.sum(spatial_policy * screen_action_one_hot, axis=1) + 1e-10)
            advantage = reward - K.stop_gradient(value)
            # Mask out spatial_log_prob when the action taken did not use the screen
            policy_loss = -(ns_log_prob + spatial_log_prob *
                            screen_used) * advantage - entropy * ENTROPY_RATE
            total_loss = policy_loss + value_loss
            _entropy = K.mean(entropy)
            _policy_loss = K.mean(K.abs(policy_loss))
            _value_loss = K.mean(value_loss)
        gradients = tape.gradient(total_loss, self.model.trainable_variables)
        global_norm = tf.linalg.global_norm(gradients)
        print(global_norm)
        # Prevents exploding gradients
        gradients, _ = tf.clip_by_global_norm(gradients, GRADIENT_CLIP_MAX)
        self.opt.apply_gradients(zip(gradients,
                                     self.model.trainable_variables))
        return [
            float(_value_loss),
            float(_policy_loss),
            float(_entropy), global_norm
        ]

    def strip_reshape(self, arr):
        return np.reshape(arr, tuple(s for s in arr.shape if s > 1))

    # Call with game end step and the outcome from the environment
    def step_end(self, obs, outcome):
        last_reward = self.calc_reward(obs, self.last_obs, outcome=outcome)
        self.recorder[self.episode].reward_last_step(last_reward)

    # Takes a state and returns an action, also updates step information
    def step(self, obs, training=True):
        episode = self.recorder[self.episode]
        if self.last_obs:
            last_reward = self.calc_reward(obs, self.last_obs)
            episode.reward_last_step(last_reward)
        screens_input, action_input, select_input = self.build_inputs_from_obs(
            obs)
        spatial_action_policy, ns_action_policy, value = self.model(
            [screens_input, action_input, select_input])
        # Remove dimensions with length 1
        spatial_action_policy = self.strip_reshape(spatial_action_policy)
        ns_action_policy = self.strip_reshape(ns_action_policy)
        if training:
            try:
                screen_choice = np.random.choice(
                    SCREEN_SIZE * SCREEN_SIZE,
                    p=spatial_action_policy / np.sum(spatial_action_policy))
            except Exception:
                print('Error in %s' % self.name)
                raise
        else:
            screen_choice = np.argmax(spatial_action_policy)
        screen_x = screen_choice // SCREEN_SIZE
        screen_y = screen_choice % SCREEN_SIZE
        if training:
            # Select from probability distribution
            choice = np.random.choice(len(ns_action_policy),
                                      p=ns_action_policy)
        else:
            # Select highest probability
            choice = int(np.argmax(ns_action_policy))
        action = ACTION_OPTIONS[choice]
        build_args = []
        # Build action
        for arg in action['args']:
            if arg == 'screen':
                build_args.append([screen_x, screen_y])
            elif arg == 'screen_rect':
                build_args.append([
                    np.max([(screen_x - SELECT_SIZE), 0]),
                    np.max([(screen_y - SELECT_SIZE), 0])
                ])
                build_args.append([
                    np.min([(screen_x + SELECT_SIZE), SCREEN_SIZE - 1]),
                    np.min([(screen_y + SELECT_SIZE), SCREEN_SIZE - 1])
                ])
            elif type(arg) is int:
                build_args.append([arg])
            else:
                raise KeyError('Unrecognized function argument: %s' % arg)
        self.recorder[self.episode].save_step(
            (screens_input, action_input, select_input),
            (spatial_action_policy, ns_action_policy, value), choice,
            screen_choice, ('screen' in action['args']
                            or 'screen_rect' in action['args']))
        self.last_obs = obs
        return actions.FunctionCall(action['id'], build_args)

    def build_inputs_from_obs(self, obs):
        screens_input = np.zeros((SCREEN_DEPTH, SCREEN_SIZE, SCREEN_SIZE),
                                 dtype=np.float32)
        # Transpose feature screens because spatial observations are (y,x)
        # coordinates, everything else is (x,y)
        for ndx, name in enumerate(INPUT_SCREENS):
            if name == 'player_relative':
                screens_input[ndx] = self.convert_player_ids_vect(
                    np.array(obs.observation['feature_screen'][name]))
            elif name == 'unit_type':
                unit_types = np.array(obs.observation['feature_screen'][name])
                screens_input[ndx] = self.convert_unit_ids_vect(unit_types)
            elif name == 'unit_hit_points':
                screens_input[ndx] = np.array(
                    obs.observation['feature_screen'][name]) / UNIT_HP_SCALE
            else:
                screens_input[ndx] = np.array(
                    obs.observation['feature_screen'][name]) / getattr(
                        features.SCREEN_FEATURES, name).scale
        screens_input = np.reshape(
            screens_input, (1, SCREEN_SIZE, SCREEN_SIZE, SCREEN_DEPTH))
        # Available actions as array of 1 and 0
        action_input = np.array(
            [(0. if act_info['id'] not in obs.observation['available_actions']
              or (act_info['id'] == actions.FUNCTIONS.select_unit.id
                  and act_info['args'][1] >= len(
                      obs.observation['multi_select'])) else 1.)
             for act_info in ACTION_OPTIONS],
            dtype=np.float32)
        action_input = np.reshape(action_input, (1, len(ACTION_OPTIONS)))

        # Normalizes the unit select tensor and removes fields
        def convert_select_tensor(x):
            return np.array([
                self.convert_unit_ids(x[0]),
                self.convert_player_ids(x[1]), x[2] / UNIT_HP_SCALE
            ],
                            dtype=np.float32)

        # Selected units
        select_input = np.zeros((MAX_UNIT_SELECT, UNIT_TENSOR_LENGTH),
                                dtype=np.float32)
        for ndx, unit in enumerate(obs.observation['multi_select']):
            select_input[ndx] = convert_select_tensor(unit)
        select_input = np.reshape(select_input,
                                  (1, MAX_UNIT_SELECT * UNIT_TENSOR_LENGTH))
        return screens_input, action_input, select_input

    def calc_reward(self, obs, obs_prev, outcome=0.):
        rw = self.reward_weights
        score = obs.observation['score_by_category']
        score_prev = obs_prev.observation['score_by_category']
        # Difference in army killed minerals and vespene cost minus diff in
        # lost minerals and vespene since last state
        enemy_killed_value = (score[1][1] - score_prev[1][1]) + \
            VESPENE_SCALING * (score[2][1] - score_prev[2][1])
        friendly_killed_value = (score[3][1] - score_prev[3][1]) + \
            VESPENE_SCALING * (score[4][1] - score_prev[4][1])
        diff_value = rw['enemy_killed_value'] * enemy_killed_value - \
            rw['friendly_killed_value'] * friendly_killed_value
        score = obs.observation['score_by_vital']
        score_prev = obs_prev.observation['score_by_vital']
        # Difference in damage dealt minus damage taken since last state
        damage_given = score[0][0] - score_prev[0][0]
        damage_taken = score[1][0] - score_prev[1][0]
        diff_damage = rw['damage_given'] * damage_given - \
            rw['damage_taken'] * damage_taken
        reward = .005 * rw['killed_value'] * diff_value + \
            .01 * rw['damage'] * diff_damage + rw['outcome'] * outcome * .5
        return reward

    def build_model(self, screen_width, screen_height, screen_depth,
                    select_input_length, action_size, training=True):
        K.set_floatx('float32')
        # Inputs
        screen_input = Input(shape=(screen_width, screen_height,
                                    screen_depth),
                             dtype='float32')
        action_input = Input(shape=(action_size, ), dtype='float32')
        select_input = Input(shape=(MAX_UNIT_SELECT * select_input_length, ),
                             dtype='float32')
        screen_part = TimeDistributed(
            Conv2D(screen_depth, 5, strides=1, padding='same'))(screen_input)
        screen_part = TimeDistributed(BatchNormalization())(screen_part)
        screen_part = TimeDistributed(Activation('relu'))(screen_part)
        screen_part = TimeDistributed(
            Conv2D(screen_depth, 3, strides=1, padding='same'))(screen_part)
        screen_part = TimeDistributed(BatchNormalization())(screen_part)
        screen_part = TimeDistributed(Activation('relu'))(screen_part)
        action_1 = TimeDistributed(
            Dense(screen_width * screen_height,
                  use_bias=True,
                  activation='relu',
                  name='ingrid'))(action_input)
        action_1 = TimeDistributed(Reshape(
            (screen_width, screen_height, 1)))(action_1)
        select_1 = TimeDistributed(
            Dense(screen_width * screen_height,
                  use_bias=True,
                  activation='relu',
                  name='steve'))(select_input)
        select_1 = TimeDistributed(Reshape(
            (screen_width, screen_height, 1)))(select_1)
        # Apply the wrapped Concatenate layer to the tensors (the original
        # called the layer before wrapping, which is not valid layer usage)
        core = TimeDistributed(
            Concatenate(axis=3))([screen_part, action_1, select_1])
        # `training` is a call-time argument, not a constructor argument
        core = ConvLSTM2D(1, 5, strides=1, padding='same',
                          activation='relu')(core, training=training)
        # core = Conv2D(10, 5, strides=1, padding='same')(core)
        # core = BatchNormalization()(core)
        # core = Activation('relu')(core)
        # core = Conv2D(4, 5, strides=1, padding='same')(core)
        # core = BatchNormalization()(core)
        # core = Activation('relu')(core)
        action_policy = TimeDistributed(
            Conv2D(1, 3, strides=2, padding='same', activation='relu'))(core)
        action_policy = TimeDistributed(Flatten())(action_policy)
        action_policy = TimeDistributed(
            Dense(action_size * 2, use_bias=True,
                  activation='relu'))(action_policy)
        if training:
            action_policy = TimeDistributed(
                Dropout(DROPOUT_RATE))(action_policy)
        action_policy = TimeDistributed(
            Dense(action_size * 2, use_bias=True,
                  activation='relu'))(action_policy)
        if training:
            action_policy = TimeDistributed(
                Dropout(DROPOUT_RATE))(action_policy)
        action_policy = TimeDistributed(Dense(action_size))(action_policy)
        # Mask out unavailable actions and softmax
        action_policy = K.exp(action_policy) * action_input / (K.sum(
            K.exp(action_policy) * action_input))
        value = TimeDistributed(Conv2D(1, 5, strides=3,
                                       activation='relu'))(core)
        value = TimeDistributed(Flatten())(value)
        if training:
            value = TimeDistributed(Dropout(DROPOUT_RATE))(value)
        value = TimeDistributed(Dense(50, use_bias=True,
                                      activation='relu'))(value)
        value = TimeDistributed(Dense(1))(value)
        # Concat in the action policy to inform the screen policy
        action_policy_dense = TimeDistributed(
            Dense(screen_width * screen_height,
                  use_bias=True,
                  activation='relu'))(K.stop_gradient(action_policy))
        action_policy_dense = TimeDistributed(
            Reshape((screen_width, screen_height, 1)))(action_policy_dense)
        screen_core = TimeDistributed(
            Concatenate(axis=3))([core, action_policy_dense])
        screen_policy = TimeDistributed(Conv2D(5, 3,
                                               padding='same'))(screen_core)
        screen_policy = TimeDistributed(BatchNormalization())(screen_policy)
        screen_policy = TimeDistributed(Activation('relu'))(screen_policy)
        screen_policy = TimeDistributed(Conv2D(1, 3,
                                               padding='same'))(screen_policy)
        screen_policy = TimeDistributed(Flatten())(screen_policy)
        screen_policy = TimeDistributed(Activation('softmax'))(screen_policy)
        model = Model([screen_input, action_input, select_input],
                      [screen_policy, action_policy, value])
        return model
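# get_discounted_rewards() is referenced by MainAgent.train() but not defined
# in this snippet; a standard discounted-return helper might look like this
# (an assumption, not the original implementation):
def get_discounted_rewards(rewards, discount_rate=0.99):
    discounted = np.zeros(len(rewards), dtype=np.float32)
    running = 0.
    # Accumulate returns backwards: G_t = r_t + gamma * G_{t+1}
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount_rate * running
        discounted[t] = running
    return discounted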
class Critic():
    def __init__(self,
                 ALPHA,
                 lambda_=0,
                 Gamma=0.99,
                 n_actions=4,
                 layer1_size=16,
                 layer2_size=16,
                 input_dims=8):
        self.gamma = Gamma
        # self.lr = ALPHA
        self.lambda_ = lambda_
        self.input_dims = input_dims
        self.h1_dims = layer1_size
        self.h2_dims = layer2_size
        self.n_actions = n_actions
        self.critic = self.build_policy_network()
        self.optimizer = RMSprop(learning_rate=ALPHA)
        self.actions_space = [i for i in range(n_actions)]

    def build_policy_network(self):
        # Build the network
        input = Input(shape=(self.input_dims, ))
        # No hidden layer
        if self.h1_dims == 0 and self.h2_dims == 0:
            Q_values = Dense(self.n_actions, activation='linear')(input)
        # One hidden layer
        elif self.h1_dims != 0 and self.h2_dims == 0:
            dense1 = Dense(self.h1_dims,
                           activation='relu',
                           kernel_regularizer=l2(0.01))(input)
            Q_values = Dense(self.n_actions, activation='linear')(dense1)
        # Two hidden layers
        else:
            dense1 = Dense(self.h1_dims,
                           activation='relu',
                           kernel_regularizer=l2(0.01))(input)
            dense2 = Dense(self.h2_dims,
                           activation='relu',
                           kernel_regularizer=l2(0.01))(dense1)
            Q_values = Dense(self.n_actions, activation='linear')(dense2)
        critic = Model(inputs=[input], outputs=[Q_values])
        critic.summary()
        # Eligibility traces are initialized to zero
        # tvs = critic.trainable_variables
        # self.eligibility = [tf.Variable(tf.zeros_like(tv), trainable=False) for tv in tvs]
        return critic

    def initialize_eligibility(self, observation, action):
        state = observation[np.newaxis, :]
        # Get gradient of Q function
        with tf.GradientTape() as tape:
            Qvalues = self.critic(state)
            tvs = self.critic.trainable_variables
            Q = Qvalues[0, action]
        # Gradient of Q for the current state and action with respect to the
        # weights (bias included) of the network
        grads = tape.gradient(Q, tvs)
        self.eligibility = grads

    def learn(self, reward, next_state, next_action, Q, done):
        # Get gradient of Q function
        with tf.GradientTape() as tape:
            Qvalues = self.critic(next_state)
            tvs = self.critic.trainable_variables
            next_Q = Qvalues[0, next_action]
        # Gradient of Q for the next state and action with respect to the
        # weights (bias included) of the network
        grads = tape.gradient(next_Q, tvs)
        Q_ = np.array(next_Q)
        # When done is true, drop the value of the next state and change only
        # the target value of the present action
        TD_error = reward + self.gamma * Q_ * (1 - int(done)) - Q
        # Update weights along the eligibility traces
        td_el = [TD_error * el for el in self.eligibility]
        # Normalize each trace update (the original loop reassigned its loop
        # variable, so the normalization never took effect)
        td_el = [el / np.linalg.norm(el) if np.linalg.norm(el) != 0.0 else el
                 for el in td_el]
        self.optimizer.apply_gradients(
            zip(td_el, self.critic.trainable_variables))
        # Decay eligibility traces and add the new gradient
        self.eligibility = [
            (self.gamma * self.lambda_ * self.eligibility[i]) + grad
            for i, grad in enumerate(grads)
        ]

    def save_model(self, name):
        self.critic.save(name)

    def load_model(self, name):
        self.critic = load_model(name)
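# A minimal SARSA(lambda)-style loop sketch using the eligibility-trace
# Critic above, assuming a Gym-like `env` with an 8-dimensional observation;
# epsilon_greedy() is a hypothetical helper, not part of the original class.
def epsilon_greedy(critic, state, epsilon=0.1):
    if np.random.rand() < epsilon:
        return np.random.choice(critic.actions_space)
    return int(np.argmax(critic.critic(state[np.newaxis, :])[0]))

critic = Critic(ALPHA=0.001, lambda_=0.9, Gamma=0.99, n_actions=4,
                input_dims=8)
state = env.reset()
action = epsilon_greedy(critic, state)
critic.initialize_eligibility(state, action)
done = False
while not done:
    next_state, reward, done, _ = env.step(action)
    next_action = epsilon_greedy(critic, next_state)
    Q = float(critic.critic(state[np.newaxis, :])[0, action])
    critic.learn(reward, next_state[np.newaxis, :], next_action, Q, done)
    state, action = next_state, next_action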
class PPO:
    def __init__(self,
                 action_dim,
                 k,
                 clip_norm=None,
                 optim="adam",
                 write_weights=False,
                 gamma=0.9,
                 eps=0.2,
                 actor_lr=0.0001,
                 critic_lr=0.0002,
                 actor_update_steps=10,
                 critic_update_steps=10):
        self.action_dim = action_dim
        self.k = k
        self.gamma = gamma
        self.eps = eps
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.actor_update_steps = actor_update_steps
        self.critic_update_steps = critic_update_steps
        self.actor = Actor(action_dim, k)
        self.critic = Critic()
        if optim == "adam":
            self.actor_optim = Adam(actor_lr, clipnorm=clip_norm) \
                if clip_norm is not None else Adam(actor_lr)
            self.critic_optim = Adam(critic_lr, clipnorm=clip_norm) \
                if clip_norm is not None else Adam(critic_lr)
        elif optim == "rms":
            self.actor_optim = RMSprop(actor_lr, clipnorm=clip_norm) \
                if clip_norm is not None else RMSprop(actor_lr)
            self.critic_optim = RMSprop(critic_lr, clipnorm=clip_norm) \
                if clip_norm is not None else RMSprop(critic_lr)
        self.actor_old = Actor(action_dim, k)
        self.write_weights = write_weights

    def choose_action(self, state):
        # Currently the state should be (1, s_dim = |V| * 4)
        action_probs = self.actor.predict(state)  # (1, a_dim = |E|)
        dist = tfd.Categorical(probs=action_probs)
        action = self.sample_without_replacement(action_probs,
                                                 self.k)  # (1, k)
        action = tf.squeeze(action)
        return action.numpy().tolist(), tf.clip_by_value(
            tf.squeeze(
                tf.math.reduce_prod(dist.prob(action)) /
                (self.actor.normalizer(action_probs, self.k)).numpy() +
                self.actor.eps), 1e-16, 1)  # (k,), ()

    def sample_without_replacement(self, p, k):
        # Gumbel-top-k trick: add Gumbel noise to the log-probabilities and
        # take the k largest entries
        z = -tf.math.log(-tf.math.log(tf.random.uniform(tf.shape(p), 0, 1)))
        z = tf.cast(z, tf.double)
        pr = tf.cast(p, tf.double)
        _, indices = tf.math.top_k(tf.math.log(pr) + z, k)
        return indices

    def get_v(self, state):
        v_tensor = self.critic(state)  # (1, 1)
        return v_tensor.numpy()[0, 0]

    def update(self, memory: Memory, discounted_rewards, writer, name, step):
        self.actor_old.set_weights(self.actor.get_weights())
        # pi_old_a, pi_old = self.actor_old.evaluate_probs(memory.states,
        #                                                  memory.actions)
        pi_old_a = np.array(memory.probs)
        memory_state_values = self.critic(np.array(memory.states))
        memory_state_values = tf.squeeze(memory_state_values)  # (N,)
        discounted_rewards_arr = np.array(discounted_rewards)
        advantages = discounted_rewards_arr - memory_state_values
        for au in range(self.actor_update_steps):
            with tf.GradientTape() as ag:
                pi_theta_a = self.actor.evaluate_probs(memory.states,
                                                       memory.actions)
                # pi_old = tf.stop_gradient(tf.convert_to_tensor(memory.probs))
                ratios = pi_theta_a / pi_old_a  # (N,)
                surr1 = ratios * advantages
                surr2 = tf.clip_by_value(ratios, 1 - self.eps,
                                         1 + self.eps) * advantages
                # Clipped surrogate objective
                actor_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))
            gradient = ag.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_optim.apply_gradients(
                zip(gradient, self.actor.trainable_variables))
            # kl_div = tf.keras.losses.KLDivergence()(pi_old, pi_theta)
            # with writer.as_default():
            #     tf.summary.scalar("kl div", kl_div.numpy(), step=au)
        for cu in range(self.critic_update_steps):
            with tf.GradientTape() as cg:
                memory_state_values = self.critic(np.array(memory.states))
                memory_state_values = tf.squeeze(memory_state_values)
                advantages = discounted_rewards_arr - memory_state_values
                critic_loss = tf.reduce_mean(tf.square(advantages))
            gradient = cg.gradient(critic_loss,
                                   self.critic.trainable_variables)
            self.critic_optim.apply_gradients(
                zip(gradient, self.critic.trainable_variables))

    def init_ac(self, state):
        # Run each model once so the weights are built before loading
        self.actor_old.predict(state)
        self.actor.predict(state)
        self.critic.predict(state)

    def load_ac(self, a_weights, c_weights):
        self.actor_old.load_weights(a_weights)
        self.actor.load_weights(a_weights)
        self.critic.load_weights(c_weights)
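# Memory is referenced by PPO.update() but not defined in this snippet; a
# minimal rollout buffer consistent with how update() uses it might look
# like this (an assumption about the original):
class Memory:
    def __init__(self):
        self.states = []   # list of state vectors
        self.actions = []  # list of k-subsets of edge indices
        self.probs = []    # list of old action probabilities

    def store(self, state, action, prob):
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(prob)

    def clear(self):
        self.states, self.actions, self.probs = [], [], []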
class CGAN:
    """Generate y conditioned on x."""
    def __init__(self,
                 x_features,
                 y_features,
                 latent_dim=32,
                 g_hidden=32,
                 d_hidden=32,
                 label_smooth=0.9,
                 d_dropout=0.1,
                 gp_weight=1,
                 ds_weight=1):
        self.x_features = x_features
        self.y_features = y_features
        self.latent_dim = latent_dim
        self.g_hidden = g_hidden
        self.d_hidden = d_hidden
        self.label_smooth = label_smooth
        self.d_dropout = d_dropout
        self.gp_weight = gp_weight
        self.ds_weight = ds_weight
        self.g_optimizer = Adam(0.0001)
        self.d_optimizer = RMSprop(0.0001)
        self.generator = self.build_generator()
        self.discriminator = self.build_discriminator()

    def build_generator(self):
        """Generator model consists of a dense layer after each component."""
        noise = Input(shape=(self.latent_dim, ))  # noise
        d_noise = Dense(self.g_hidden)(noise)
        x = Input(shape=(self.x_features, ))  # condition
        d_x = Dense(self.g_hidden)(x)
        z = Concatenate()([d_noise, d_x])
        d_z = Dense(self.g_hidden)(z)
        y = Dense(self.y_features)(d_z)
        return Model([noise, x], y)

    def build_discriminator(self):
        """Discriminator model consists of a dense layer after each component."""
        x = Input(shape=(self.x_features, ))  # condition
        d_x = Dense(self.d_hidden)(x)
        y = Input(shape=(self.y_features, ))  # y
        d_y = Dense(self.d_hidden)(y)
        h = Concatenate()([d_x, d_y])
        h = Dense(self.d_hidden)(h)
        h = Dropout(self.d_dropout)(h)
        p = Dense(1)(h)
        return Model([y, x], p)

    def g_loss(self, fake_pred):
        return -tf.math.reduce_mean(fake_pred)

    def d_loss(self, real_pred, fake_pred):
        return -tf.math.reduce_mean(
            real_pred * self.label_smooth) + tf.math.reduce_mean(fake_pred)

    def loss(self, X, y):
        noise = tf.random.normal((X.shape[0], self.latent_dim))
        fake_y = self.generator([noise, X])
        fake_pred = self.discriminator([fake_y, X])
        return self.g_loss(fake_pred)

    def gradient_penalty(self, real_y, fake_y, X):
        """Gradient penalty on the discriminator."""
        batch_size = real_y.shape[0]
        epsilon = tf.random.normal([batch_size, self.y_features], 0.0, 1.0)
        interpolate_y = epsilon * real_y + (1 - epsilon) * fake_y
        with tf.GradientTape() as gp_tape:
            gp_tape.watch(interpolate_y)
            pred = self.discriminator([interpolate_y, X], training=True)
        gradients = gp_tape.gradient(pred, interpolate_y)
        norm = tf.sqrt(tf.reduce_sum(tf.square(gradients), axis=1))
        gp = tf.reduce_mean((norm - 1.0)**2)
        return gp * self.gp_weight

    def diversity_score(self, X):
        batch_size = X.shape[0]
        z1 = tf.random.normal([batch_size, self.latent_dim])
        z2 = tf.random.normal([batch_size, self.latent_dim])
        y1 = self.generator([z1, X], training=True)
        y2 = self.generator([z2, X], training=True)
        denom = tf.reduce_mean(tf.abs(z1 - z2), axis=1)
        numer = tf.reduce_mean(tf.abs(y1 - y2), axis=1)
        ds = tf.reduce_mean(numer / denom)
        # Lower bound for numerical stability
        return tf.math.minimum(ds, 0.1) * self.ds_weight

    @tf.function
    def train_step(self, X, real_y):
        noise = tf.random.normal((X.shape[0], self.latent_dim))
        with tf.GradientTape() as g_tape, tf.GradientTape() as d_tape:
            fake_y = self.generator([noise, X], training=True)
            real_pred = self.discriminator([real_y, X], training=True)
            fake_pred = self.discriminator([fake_y, X], training=True)
            gp = self.gradient_penalty(real_y, fake_y, X)
            ds = self.diversity_score(X)
            g_loss = self.g_loss(fake_pred) - ds
            d_loss = self.d_loss(real_pred, fake_pred) + gp
        g_gradients = g_tape.gradient(g_loss,
                                      self.generator.trainable_variables)
        d_gradients = d_tape.gradient(d_loss,
                                      self.discriminator.trainable_variables)
        self.g_optimizer.apply_gradients(
            zip(g_gradients, self.generator.trainable_variables))
        self.d_optimizer.apply_gradients(
            zip(d_gradients, self.discriminator.trainable_variables))
        return g_loss, d_loss

    def fit(self, X, y, epochs=1000, verbose=1, plot=False, logdir='cgan'):
        # TensorBoard
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        train_log_dir = 'logs/' + logdir + '/' + current_time
        train_summary_writer = tf.summary.create_file_writer(train_log_dir)
        for epoch in range(epochs):
            g_loss, d_loss = self.train_step(X, y)
            with train_summary_writer.as_default():
                tf.summary.scalar('Generator Loss', g_loss, step=epoch)
                tf.summary.scalar('Discriminator Loss', d_loss, step=epoch)
            if verbose and epoch % (epochs // 10) == 0:
                print(f"{epoch} [D loss: {d_loss}] [G loss: {g_loss}]")

    def predict(self, X):
        noise = tf.random.normal((X.shape[0], self.latent_dim))
        return self.generator([noise, X]).numpy()
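# A short usage sketch for CGAN on toy data: learn to generate y = 2x + noise
# conditioned on x. All data here is synthetic and purely illustrative.
X = np.random.rand(256, 1).astype(np.float32)
y = 2 * X + 0.05 * np.random.randn(256, 1).astype(np.float32)
cgan = CGAN(x_features=1, y_features=1)
cgan.fit(X, y, epochs=1000, verbose=0)
y_hat = cgan.predict(X[:5])  # one sampled y per conditioning x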
class Trainer:
    def __init__(self,
                 util: Utils,
                 hr_size=96,
                 log_dir: str = None,
                 num_resblock: int = 16):
        self.vgg = self.build_vgg(20)
        self.learning_rate = 0.00005
        self.clipping = 0.01
        self.generator_optimizer = RMSprop(learning_rate=self.learning_rate,
                                           clipvalue=self.clipping)
        self.discriminator_optimizer = RMSprop(
            learning_rate=self.learning_rate, clipvalue=self.clipping)
        self.binary_cross_entropy = BinaryCrossentropy(from_logits=True)
        self.mean_squared_error = MeanSquaredError()
        self.util: Utils = util
        self.HR_SIZE = hr_size
        self.LR_SIZE = self.HR_SIZE // 4
        if log_dir is not None:
            self.summary_writer = tf.summary.create_file_writer(log_dir)
            if log_dir.startswith('../'):
                log_dir = log_dir[len('../'):]
            print('open tensorboard with: tensorboard --logdir ' + log_dir)
        else:
            self.summary_writer = None
        self.generator = make_generator_model(num_res_blocks=num_resblock)
        self.discriminator = make_discriminator_model(self.HR_SIZE)
        self.checkpoint = tf.train.Checkpoint(
            generator=self.generator, discriminator=self.discriminator)

    def summary(self):
        print('Discriminator:')
        print(self.discriminator.summary())
        print('Generator: \n')
        print(self.generator.summary())

    def build_vgg(self, output_layer):
        # Truncated VGG19 used as the perceptual feature extractor
        vgg = VGG19(input_shape=(None, None, 3), include_top=False)
        return Model(vgg.input, vgg.layers[output_layer].output)

    def train_generator(self,
                        train_dataset,
                        valid_dataset,
                        epochs=20000,
                        valid_lr=None,
                        valid_hr=None):
        evaluate_size = epochs // 10
        loss_mean = Mean()
        start_time = time.time()
        epoch = 0
        for lr, hr in train_dataset.take(epochs):
            epoch += 1
            step = tf.convert_to_tensor(epoch, dtype=tf.int64)
            generator_loss = self.train_generator_step(lr, hr)
            loss_mean(generator_loss)
            if epoch % 50 == 0:
                loss_value = loss_mean.result()
                loss_mean.reset_states()
                psnr_value = self.evaluate(valid_dataset.take(1))
                print(
                    f'Time for epoch {epoch}/{epochs} is {(time.time() - start_time):.4f} sec, '
                    f'gan loss = {loss_value:.4f}, psnr = {psnr_value:.4f}')
                start_time = time.time()
                if self.summary_writer is not None:
                    with self.summary_writer.as_default():
                        tf.summary.scalar('generator_loss',
                                          loss_value,
                                          step=epoch)
                        tf.summary.scalar('psnr', psnr_value, step=epoch)
            if epoch % evaluate_size == 0:
                self.util.save_checkpoint(self.checkpoint, epoch)
            if epoch % 5000 == 0:
                self.generate_and_save_images(step, valid_lr, valid_hr)

    def train_gan(self,
                  train_dataset,
                  valid_dataset,
                  epochs=200000,
                  valid_lr=None,
                  valid_hr=None):
        evaluate_size = epochs // 10
        start = time.time()
        vgg_metric = Mean()
        dls_metric = Mean()
        g_metric = Mean()
        c_metric = Mean()
        epoch = 0
        for lr, hr in train_dataset.take(epochs):
            epoch += 1
            step = tf.convert_to_tensor(epoch, tf.int64)
            vgg_loss, discriminator_loss, generator_loss, content_loss = \
                self.train_gan_step(lr, hr)
            vgg_metric(vgg_loss)
            dls_metric(discriminator_loss)
            g_metric(generator_loss)
            c_metric(content_loss)
            if epoch % 50 == 0:
                vgg = vgg_metric.result()
                discriminator_loss_metric = dls_metric.result()
                generator_loss_metric = g_metric.result()
                content_loss_metric = c_metric.result()
                vgg_metric.reset_states()
                dls_metric.reset_states()
                g_metric.reset_states()
                c_metric.reset_states()
                psnr_value = self.evaluate(valid_dataset.take(1))
                print(
                    f'Time for epoch {epoch}/{epochs} is {(time.time() - start):.4f} sec, '
                    f' perceptual loss = {vgg:.4f},'
                    f' generator loss = {generator_loss_metric:.4f},'
                    f' discriminator loss = {discriminator_loss_metric:.4f},'
                    f' content loss = {content_loss_metric:.4f},'
                    f' psnr = {psnr_value:.4f}')
                start = time.time()
                if self.summary_writer is not None:
                    with self.summary_writer.as_default():
                        tf.summary.scalar('generator_loss',
                                          generator_loss_metric,
                                          step=epoch)
                        tf.summary.scalar('content loss',
                                          content_loss_metric,
                                          step=epoch)
                        tf.summary.scalar(
                            'vgg loss = content loss + 0.0001 * gan loss',
                            vgg,
                            step=epoch)
                        tf.summary.scalar('discriminator_loss',
                                          discriminator_loss_metric,
                                          step=epoch)
                        tf.summary.scalar('psnr', psnr_value, step=epoch)
            if epoch % evaluate_size == 0:
                self.util.save_checkpoint(self.checkpoint, epoch)
            if epoch % 5000 == 0:
                self.generate_and_save_images(step, valid_lr, valid_hr)

    @tf.function
    def train_generator_step(self, lr, hr):
        with tf.GradientTape() as tape:
            lr = tf.cast(lr, tf.float32)
            hr = tf.cast(hr, tf.float32)
            fake_image = self.generator(lr, training=True)
            loss_value = self.mean_squared_error(hr, fake_image)
        gradients = tape.gradient(loss_value,
                                  self.generator.trainable_variables)
        self.generator_optimizer.apply_gradients(
            zip(gradients, self.generator.trainable_variables))
        return loss_value

    @tf.function
    def train_gan_step(self, lr, hr):
        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            lr = tf.cast(lr, tf.float32)
            hr = tf.cast(hr, tf.float32)
            fake_image = self.generator(lr, training=True)
            real_classification = self.discriminator(hr, training=True)
            fake_classification = self.discriminator(fake_image,
                                                     training=True)
            content_loss = self.content_loss(hr, fake_image)
            generator_loss = self.generator_loss(fake_image)
            # lpips_loss = self.lpips_loss(hr, fake_image)
            vgg_loss = content_loss + 0.001 * generator_loss
            # loss = generator_loss + 100 * lpips_loss
            discriminator_loss = self.discriminator_loss(
                real_classification, fake_classification)
        gradients_of_generator = gen_tape.gradient(
            vgg_loss, self.generator.trainable_variables)
        gradients_of_discriminator = disc_tape.gradient(
            discriminator_loss, self.discriminator.trainable_variables)
        self.generator_optimizer.apply_gradients(
            zip(gradients_of_generator, self.generator.trainable_variables))
        self.discriminator_optimizer.apply_gradients(
            zip(gradients_of_discriminator,
                self.discriminator.trainable_variables))
        return vgg_loss, discriminator_loss, generator_loss, content_loss

    # Loss functions:
    def lpips_loss(self, hr, fake_image):
        # Requires self.loss_fn_vgg (e.g. an LPIPS model) to be set externally
        nhr = hr.numpy()
        nfi = fake_image.numpy()
        return self.loss_fn_vgg(nhr, nfi)

    @tf.function
    def content_loss(self, hr, fake_image):
        fake_image = preprocess_input(fake_image)
        hr = preprocess_input(hr)
        fake_features = self.vgg(fake_image) / 12.75
        hr_features = self.vgg(hr) / 12.75
        return self.mean_squared_error(hr_features, fake_features)

    @tf.function
    def discriminator_loss(self, real_class, fake_class):
        # Wasserstein critic loss (the commented lines are the BCE variant)
        # hr_loss = self.binary_cross_entropy(tf.ones_like(real_class), real_class)
        # fake_loss = self.binary_cross_entropy(tf.zeros_like(fake_class), fake_class)
        # return hr_loss + fake_loss
        return tf.reduce_mean(fake_class) - tf.reduce_mean(real_class)

    @tf.function
    def generator_loss(self, fake_class):
        gan_loss = -tf.reduce_mean(fake_class)
        # gan_loss = self.binary_cross_entropy(tf.ones_like(fake_class), fake_class)
        return gan_loss

    # Helpers
    def save_model(self, appendix=''):
        self.util.save_model(self.generator, 'generator' + appendix)
        self.util.save_model(self.discriminator, 'discriminator' + appendix)

    def generate_and_save_images(self, step, lr, hr):
        epoch = tf.cast(step, tf.int64)
        plt.close('all')
        generated = self.util.resolve_single(self.generator, lr)
        plt.figure(figsize=(15, 30), clear=True)
        figures = [lr, generated, hr]
        titles = ['LR', 'Generated', 'HR']
        for i in range(3):
            plt.subplot(3, 1, 1 + i)
            plt.title(titles[i])
            plt.imshow(figures[i] / 255)
            plt.axis('off')
            plt.xticks([])
            plt.yticks([])
        fig = plt.gcf()
        self.util.save_figure(fig, epoch)

    def evaluate(self, dataset):
        psnr_values = []
        for lr, hr in dataset:
            sr = self.util.resolve(self.generator, lr)
            psnr_value = self.psnr(hr, sr)[0]
            psnr_values.append(psnr_value)
        return tf.reduce_mean(psnr_values)

    def psnr(self, x1, x2):
        return tf.image.psnr(x1, x2, max_val=255)

    def load_generator(self, file):
        self.generator.load_weights(file)

    def load_discriminator(self, file):
        self.discriminator.load_weights(file)

    def load_checkpoint(self, file):
        self.checkpoint.restore(
            tf.train.latest_checkpoint(file)).assert_consumed()
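# A minimal driver sketch for Trainer, assuming Utils, make_generator_model,
# and make_discriminator_model from the surrounding project, plus tf.data
# datasets yielding (lr, hr) image pairs; constructor arguments for Utils
# are omitted since they depend on that project.
util = Utils()
trainer = Trainer(util, hr_size=96, log_dir='logs/srgan', num_resblock=16)
# Pre-train the generator on pixel-wise MSE, then adversarial fine-tuning
trainer.train_generator(train_dataset, valid_dataset, epochs=20000)
trainer.train_gan(train_dataset, valid_dataset, epochs=200000)
trainer.save_model()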
class Agent:
    def __init__(self, env_id, debug, use_DDQN, mem_limit, render):
        # hyper-parameters
        self.discount_factor = 0.99
        self.minibatch_size = 32
        self.update_frequency = 4
        self.target_network_update_frequency = 10000 if not use_DDQN else 30000
        self.history_len = 4
        self.memory_size = int(
            0.9 * (float(mem_limit) * (1024**3) /
                   ((84 * 84 * 4 * 2 + 4 + 4 + 1) +
                    (2 * 4)) if mem_limit else 130000)) if not debug else 5000
        self.init_explr = 1.0
        self.final_explr = 0.1
        self.final_explr_frame = 1000000
        self.terminal_explr = 0.01
        self.terminal_explr_frame = self.final_explr_frame * 10
        self.replay_start_size = 50000 if not debug else 3000
        self.training_frames = int(1e7)
        self.learning_rate = 0.00025
        self.momentum = 0.95
        self.frame_skip = 4
        # frames limit
        self.fps = 60
        self.max_playing_time = 5  # minutes
        self.total_frames_limit = self.fps * 60 * self.max_playing_time
        # environment
        self.env_id = env_id
        self.env = AtariEnvironment(self.env_id, self.total_frames_limit)
        # other parameters
        self.action_num = self.env.get_action_num()
        self.latest_record_num = 100
        self.print_info_interval = 20 if not debug else 1
        self.save_weight_interval = 200 if not debug else 3
        self.highest_score = -1e9
        self.use_DDQN = use_DDQN
        self.render = True if render else False
        # network
        self.memory = PrioritizedReplayMemory(
            minibatch_size=self.minibatch_size,
            memory_size=self.memory_size,
            history_len=self.history_len)
        self.main_network = Network(action_num=self.action_num,
                                    history_len=self.history_len)
        self.target_network = Network(action_num=self.action_num,
                                      history_len=self.history_len)
        # self.optimizer = Adam(lr=self.learning_rate, epsilon=1e-6)
        self.optimizer = RMSprop(learning_rate=self.learning_rate,
                                 momentum=self.momentum,
                                 epsilon=1e-2)
        self.loss = tf.keras.losses.Huber()
        self.loss_metric = tf.keras.metrics.Mean()
        self.q_metric = tf.keras.metrics.Mean()
        # other tools (log, summary)
        self.log_path = ("drive/My Drive/AtariGamer/" if not debug else
                         "./") + "log/" + datetime.now().strftime(
                             "%Y%m%d_%H%M%S") + "_" + self.env_id
        print("- DDQN:", ("YES" if self.use_DDQN else "NO"))

    @tf.function
    def get_action(self, state, exploration_rate):
        """
        get action by ε-greedy algorithm
        :param state: current state
        :param exploration_rate: current exploration rate
        :return: action, an integer
        """
        if tf.random.uniform(
            (), minval=0, maxval=1, dtype=tf.float32
        ) < exploration_rate:  # explore: randomly choose action
            action = tf.random.uniform((),
                                       minval=0,
                                       maxval=self.action_num,
                                       dtype=tf.int32)
        else:
            q_value = self.main_network(
                tf.cast(tf.expand_dims(state, axis=0), tf.float32))
            action = tf.cast(tf.squeeze(tf.argmax(q_value, axis=1)),
                             dtype=tf.int32)
        return action

    @tf.function
    def get_explr(self, frames):
        """
        get exploration rate using linear annealing
        :param frames: the number of frames passed
        :return: exploration rate, a float
        """
        if frames < self.replay_start_size:
            explr = self.init_explr
        elif frames < self.final_explr_frame:
            explr = self.init_explr + (self.final_explr - self.init_explr) / (
                self.final_explr_frame - self.replay_start_size) * (
                    frames - self.replay_start_size)
        elif frames < self.terminal_explr_frame:
            explr = self.final_explr + (
                self.terminal_explr - self.final_explr) / (
                    self.terminal_explr_frame - self.final_explr_frame) * (
                        frames - self.final_explr_frame)
        else:
            explr = self.terminal_explr
        return explr

    @tf.function
    def update_main_network_natural(self, state_batch, action_batch,
                                    reward_batch, next_state_batch,
                                    terminated_batch, weight_batch):
        """
        update main Q network by experience replay
        :param weight_batch: importance sampling weight
        :param state_batch: batch of states
        :param action_batch: batch of actions
        :param reward_batch: batch of rewards
        :param next_state_batch: batch of next states
        :param terminated_batch: batch of whether it is terminated
        :return: Huber loss
        """
        with tf.GradientTape() as tape:
            next_state_q = self.target_network(next_state_batch)
            next_state_max_q = tf.reduce_max(next_state_q, axis=1)
            expected_q = reward_batch + self.discount_factor * \
                next_state_max_q * (1.0 - tf.cast(terminated_batch, tf.float32))
            main_q = tf.reduce_sum(
                self.main_network(state_batch) * tf.one_hot(
                    action_batch, self.action_num, on_value=1.0,
                    off_value=0.0),
                axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q,
                             weight_batch)
        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(
            zip(clipped_gradients, self.main_network.trainable_variables))
        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)
        return tf.abs(main_q - expected_q)

    @tf.function
    def update_main_network_DDQN(self, state_batch, action_batch,
                                 reward_batch, next_state_batch,
                                 terminated_batch, weight_batch):
        """
        update main Q network by experience replay
        :param weight_batch: importance sampling weight
        :param state_batch: batch of states
        :param action_batch: batch of actions
        :param reward_batch: batch of rewards
        :param next_state_batch: batch of next states
        :param terminated_batch: batch of whether it is terminated
        :return: Huber loss
        """
        with tf.GradientTape() as tape:
            main_next_state_q_list = self.main_network(next_state_batch)
            target_next_state_q_list = self.target_network(next_state_batch)
            max_action = tf.argmax(main_next_state_q_list, axis=-1)
            next_state_q = tf.reduce_sum(
                target_next_state_q_list * tf.one_hot(
                    max_action, self.action_num, on_value=1.0, off_value=0.0),
                axis=1)
            expected_q = reward_batch + self.discount_factor * \
                next_state_q * (1.0 - tf.cast(terminated_batch, tf.float32))
            main_q_list = self.main_network(state_batch)
            main_q = tf.reduce_sum(
                main_q_list * tf.one_hot(
                    action_batch, self.action_num, on_value=1.0,
                    off_value=0.0),
                axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q,
                             weight_batch)
        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(
            zip(clipped_gradients, self.main_network.trainable_variables))
        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)
        return tf.abs(main_q - expected_q)

    @tf.function
    def update_target_network(self):
        """ synchronize weights of target network with main network """
        main_weights = self.main_network.trainable_variables
        target_weights = self.target_network.trainable_variables
        for main_v, target_v in zip(main_weights, target_weights):
            target_v.assign(main_v)

    def train(self, load_path=None):
        if load_path:
            loaded_checkpoints = tf.train.latest_checkpoint(load_path)
            self.main_network.load_weights(loaded_checkpoints)
            self.target_network.load_weights(loaded_checkpoints)
            self.init_explr = self.terminal_explr
            self.final_explr = self.terminal_explr
            self.memory.beta = self.memory.beta0
        frames = 0
        episodes = 0
        latest_scores = deque(maxlen=self.latest_record_num)
        while frames < self.training_frames:
            cur_state = self.env.reset()
            episode_reward = 0
            terminated = False
            last_action = 0
            while not terminated:
                if frames % self.frame_skip == 0:
                    action = last_action
                else:
                    explr = self.get_explr(
                        tf.constant(frames, dtype=tf.float32))
                    action = self.get_action(
                        tf.constant(cur_state, dtype=tf.uint8),
                        tf.constant(explr, dtype=tf.float32))
                    last_action = action
                next_state, reward, terminated, _ = self.env.step(action)
                episode_reward += reward
                self.memory.push(cur_state, action, reward, next_state,
                                 terminated)
                cur_state = next_state
                if frames > self.replay_start_size:
                    if frames % self.update_frequency == 0:
                        (state_batch, action_batch, reward_batch,
                         next_state_batch, terminated_batch), \
                            ptr_batch, imp_samp_weight_batch = \
                            self.memory.sample()
                        update_func = self.update_main_network_DDQN \
                            if self.use_DDQN else \
                            self.update_main_network_natural
                        abs_error_batch = update_func(
                            state_batch, action_batch, reward_batch,
                            next_state_batch, terminated_batch,
                            tf.expand_dims(imp_samp_weight_batch, -1))
                        self.memory.update(ptr_batch, abs_error_batch)
                    if frames % self.target_network_update_frequency == 0:
                        self.update_target_network()
                frames += 1
                if terminated:
                    latest_scores.append(episode_reward)
                    episodes += 1
                    if episodes % self.print_info_interval == 0:
                        print(
                            "[" + datetime.now().strftime("%m.%d %H:%M:%S") +
                            "] Episode: {}\t Latest {} average score: {:.2f}"
                            "\t Progress: {} / {} ( {:.2f} % )".format(
                                episodes, self.latest_record_num,
                                np.mean(latest_scores), frames,
                                self.training_frames,
                                frames / self.training_frames * 100))
                    if episodes % self.save_weight_interval == 0:
                        average_score = self.play(None, 10)
                        if average_score > self.highest_score:
                            self.highest_score = average_score
                            print("Weights saving...", end="")
                            self.main_network.save_weights(
                                self.log_path +
                                "/score_{}".format(average_score))
                            print("Done!")

    def play(self, load_path, trials):
        if load_path is not None:
            loaded_checkpoints = tf.train.latest_checkpoint(load_path)
            self.main_network.load_weights(loaded_checkpoints)
        env = AtariEnvironment(self.env_id,
                               self.total_frames_limit,
                               clip_rewards=False,
                               episode_life=False)
        reward_list = []
        frame_list = []
        for t in range(trials):
            cur_state = env.reset()
            frames = []
            episode_reward = 0
            terminated = False
            while not terminated:
                if self.render:
                    frames.append(env.render())
                action = self.get_action(
                    tf.constant(cur_state, dtype=tf.uint8),
                    tf.constant(0.0, dtype=tf.float32))
                next_state, reward, terminated, _ = env.step(action)
                episode_reward += reward
                cur_state = next_state
            reward_list.append(episode_reward)
            frame_list.append(frames)
        print("Scores on {} trials: ".format(trials), reward_list)
        print("Highest score: ", np.max(reward_list))
        print("Average score: ", np.mean(reward_list))
        best_idx = int(np.argmax(reward_list))
        if self.render:
            imageio.mimsave(self.env_id + ".gif",
                            frame_list[best_idx],
                            fps=self.fps)
        return np.mean(reward_list)
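# The schedule in get_explr() is two-phase linear annealing; a plain-Python
# sketch of the same piecewise function (using the non-debug defaults) for
# illustration:
def linear_annealing(frames, start=50000, final_frame=1000000,
                     terminal_frame=10000000, init_e=1.0, final_e=0.1,
                     terminal_e=0.01):
    if frames < start:
        return init_e
    if frames < final_frame:
        # Phase 1: anneal from init_e to final_e
        return init_e + (final_e - init_e) * (frames - start) / (final_frame - start)
    if frames < terminal_frame:
        # Phase 2: anneal from final_e to terminal_e
        return final_e + (terminal_e - final_e) * (frames - final_frame) / (terminal_frame - final_frame)
    return terminal_e

print(linear_annealing(0))         # 1.0
print(linear_annealing(1000000))   # 0.1
print(linear_annealing(20000000))  # 0.01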
class PenguinAgent:
    def __init__(self):
        self.model = self.build_model()
        self.opt = RMSprop(learning_rate=LEARNING_RATE)
        self.recorder = [Episode()]

    def build_input(self, state):
        map_input = np.zeros((5, 11, 8), dtype=np.float32)
        map_input[0] = state['fish']
        map_input[1] = state['penguins']
        map_input[2] = np.full((11, 8), state['score'][0])
        map_input[3] = np.full((11, 8), state['score'][1])
        map_input[4] = np.full((11, 8), np.float32(state['phase']))
        # Change to channels last: (5, 11, 8) -> (11, 8, 5)
        # (the original moved the axis in the wrong direction)
        map_input = np.moveaxis(map_input, 0, -1)
        map_input = np.reshape(map_input, (1, 11, 8, 5))
        return map_input

    def step(self, state, player, training=True):
        map_input = self.build_input(state)
        policy, value = self.model([map_input])
        policy = np.squeeze(policy)
        target = None
        destination = None
        mask = np.zeros((11, 8, 2), dtype=np.float32)
        if state['phase'] == 0:
            choices = np.zeros(len(state['placements']))
            for ndx, tile in enumerate(state['placements']):
                mask[tile[0]][tile[1]][0] = 1.0
                choices[ndx] = policy[tile[0]][tile[1]][0]
            choices = K.exp(choices) / (K.sum(K.exp(choices)))
            if training:
                target_ndx = np.random.choice(len(state['placements']),
                                              p=choices)
            else:
                target_ndx = np.argmax(choices)
            target = state['placements'][target_ndx]
            destination = None
        elif state['phase'] == 1:
            # TODO: MCTS
            choices = np.zeros(len(state['moves'].keys()))
            options = list(state['moves'].keys())
            for ndx, tile in enumerate(options):
                mask[tile[0]][tile[1]][0] = 1.0
                choices[ndx] = policy[tile[0]][tile[1]][0]
            choices = K.exp(choices) / (K.sum(K.exp(choices)))
            if training:
                target_ndx = np.random.choice(len(options), p=choices)
            else:
                target_ndx = np.argmax(choices)
            target = options[target_ndx]
            choices = np.zeros(len(state['moves'][target]))
            for ndx, tile in enumerate(state['moves'][target]):
                mask[tile[0]][tile[1]][1] = 1.0
                choices[ndx] = policy[tile[0]][tile[1]][1]
            choices = K.exp(choices) / (K.sum(K.exp(choices)))
            if training:
                destination_ndx = np.random.choice(
                    len(state['moves'][target]), p=choices)
            else:
                destination_ndx = np.argmax(choices)
            destination = state['moves'][target][destination_ndx]
        self.recorder[-1].save_step(map_input, value, policy, mask, player,
                                    target, destination)
        return target, destination

    def step_end(self, rewards):
        self.recorder[-1].set_rewards(0, rewards[0])
        self.recorder[-1].set_rewards(1, rewards[1])
        self.recorder.append(Episode())

    def train(self):
        loss = np.array([0., 0., 0.])
        loss += self._train(
            np.concatenate(
                [ep.map_input[:ep.current_step] for ep in self.recorder]),
            np.concatenate(
                [ep.reward[:ep.current_step] for ep in self.recorder]),
            np.concatenate(
                [ep.policy_mask[:ep.current_step] for ep in self.recorder]),
            np.concatenate(
                [ep.policy_one_hot[:ep.current_step]
                 for ep in self.recorder]))
        self.recorder = [Episode()]  # Clear recorder after training
        return loss

    def _train(self, map_input, reward, policy_mask, policy_one_hot):
        _entropy = _policy_loss = _value_loss = 0.
        policy_mask = policy_mask.astype('float32')
        with tf.GradientTape() as tape:
            policy, value = self.model(map_input)
            value = K.squeeze(value, axis=1)
            policy = K.exp(policy) / (K.sum(K.exp(policy)))
            value_loss = .5 * K.square(reward - value)
            # Should I use policy * policy_mask here?
            entropy = -K.sum(policy * K.log(policy + 1e-10), axis=[1, 2, 3])
            log_prob = K.log(
                K.sum(policy * policy_one_hot, axis=[1, 2, 3]) + 1e-10)
            advantage = reward - K.stop_gradient(value)
            policy_loss = -log_prob * advantage - entropy * ENTROPY_RATE
            total_loss = policy_loss + value_loss
            _entropy = K.mean(entropy)
            _policy_loss = K.mean(K.abs(policy_loss))
            _value_loss = K.mean(value_loss)
        gradients = tape.gradient(total_loss, self.model.trainable_variables)
        gradients, _ = tf.clip_by_global_norm(gradients, GRADIENT_CLIP_MAX)
        self.opt.apply_gradients(zip(gradients,
                                     self.model.trainable_variables))
        return [float(_value_loss), float(_policy_loss), float(_entropy)]

    def build_model(self):
        K.set_floatx('float32')
        map_input = Input(shape=(11, 8, 5), dtype='float32')
        core = Conv2D(10, 3, strides=1, padding='same')(map_input)
        core = BatchNormalization()(core)
        core = Activation('relu')(core)
        core = Conv2D(10, 3, strides=1, padding='same')(core)
        core = BatchNormalization()(core)
        core = Activation('relu')(core)
        core = Conv2D(10, 3, strides=1, padding='same')(core)
        core = BatchNormalization()(core)
        core = Activation('relu')(core)
        policy = Conv2D(4, 3, strides=1, padding='same',
                        activation='relu')(core)
        policy = Conv2D(2, 3, strides=1, padding='same')(policy)  # 11 x 8 x 2
        value = Flatten()(core)
        value = Dense(20, use_bias=True)(value)
        value = Dense(1)(value)
        model = Model([map_input], [policy, value])
        return model

    def strip_reshape(self, arr):
        return np.reshape(arr, tuple(s for s in arr.shape if s > 1))
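# Episode is referenced throughout PenguinAgent but not defined in this
# snippet; a minimal buffer consistent with how it is used might look like
# this (an assumption about the original):
class Episode:
    def __init__(self, max_steps=512):
        self.current_step = 0
        self.map_input = np.zeros((max_steps, 11, 8, 5), dtype=np.float32)
        self.reward = np.zeros(max_steps, dtype=np.float32)
        self.policy_mask = np.zeros((max_steps, 11, 8, 2), dtype=np.float32)
        self.policy_one_hot = np.zeros((max_steps, 11, 8, 2),
                                       dtype=np.float32)
        self.player = np.zeros(max_steps, dtype=np.int32)

    def save_step(self, map_input, value, policy, mask, player, target,
                  destination):
        i = self.current_step
        self.map_input[i] = map_input
        self.policy_mask[i] = mask
        if target is not None:
            self.policy_one_hot[i][target[0]][target[1]][0] = 1.0
        if destination is not None:
            self.policy_one_hot[i][destination[0]][destination[1]][1] = 1.0
        self.player[i] = player
        self.current_step += 1

    def set_rewards(self, player, reward):
        # Assign the final reward to all steps taken by this player
        self.reward[:self.current_step][
            self.player[:self.current_step] == player] = reward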
class RL_Brain():
    def __init__(self,
                 n_features,
                 n_action,
                 memory_size=10,
                 batch_size=32,
                 gamma=0.9,
                 fi_size=8):
        self.n_features = n_features
        self.n_actions = n_action
        self.memory_size = memory_size
        self.replay_buffer = np.zeros((self.memory_size, n_features * 2 + 2),
                                      np.float32)
        self.count = 0
        self.batch_size = batch_size
        self.gamma = gamma
        self.opt = RMSprop()
        # Since the input state vector is very low-dimensional (only 2
        # dimensions), there is no need for an auto-encoder to encode and
        # decode the state; we feed the state values in directly.
        self.input_states = Input((self.n_features, ), name='input_states')
        self.branch_1_model = keras.Sequential(
            [Input((2, )), Dense(1, None, False, name='R')])
        self.branch_2_model = [
            keras.Sequential([
                Input((2, )),
                Dense(5, 'relu', name='mu/m%s/layer1' % i),
                Dense(5, 'relu', name='mu/m%s/layer2' % i),
                Dense(2, name='mu/m%s/layer3' % i)
            ],
                             name='branch_%s' % i)
            for i in range(self.n_actions)
        ]

    def learn_w(self, state, r):
        with tf.GradientTape() as tape:
            pred = self.branch_1_model(state)
            loss = mean_squared_error(r, pred)
        grads = tape.gradient(loss, self.branch_1_model.trainable_variables)
        self.opt.apply_gradients(
            zip(grads, self.branch_1_model.trainable_variables))

    def learn_mu(self, state, state_, action_index):
        w = self.branch_1_model.get_layer('R').get_weights()[0]
        mus_ = []
        for i in range(self.n_actions):
            mus_.append(self.branch_2_model[i](state_))
        mus_ = np.squeeze(mus_)
        max_index = np.argmax(np.squeeze(np.matmul(mus_, w)), axis=0)
        with tf.GradientTape() as tape:
            pred = self.branch_2_model[action_index](state)
            label = state + self.gamma * mus_[max_index]
            loss = mean_squared_error(label, pred)
        grads = tape.gradient(
            loss, self.branch_2_model[action_index].trainable_variables)
        self.opt.apply_gradients(
            zip(grads,
                self.branch_2_model[action_index].trainable_variables))

    def choose_action(self, state, is_random=False):
        if is_random:
            return np.random.choice(self.n_actions)
        w = self.branch_1_model.get_layer('R').get_weights()[0]
        mus = []
        for i in range(self.n_actions):
            pred = self.branch_2_model[i](state)
            mus.append(pred)
        mus = np.squeeze(mus)
        rs = np.squeeze(np.matmul(mus, w))
        # Break ties randomly when all action values are equal
        if len(set(rs)) == 1:
            action_index = np.random.choice(self.n_actions)
        else:
            action_index = np.argmax(rs)
        return action_index

    def append_to_replay_buffer(self, s, a, r, s_):
        transition = np.hstack([s, a, r, s_])
        self.replay_buffer[self.count % self.memory_size] = transition
        self.count += 1
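# A short interaction sketch for RL_Brain, assuming a 2-feature state; the
# states and reward here are random and purely illustrative.
brain = RL_Brain(n_features=2, n_action=4)
state = np.random.rand(1, 2).astype(np.float32)
a = brain.choose_action(state, is_random=True)
next_state = np.random.rand(1, 2).astype(np.float32)
reward = 1.0
brain.append_to_replay_buffer(state.ravel(), a, reward, next_state.ravel())
brain.learn_w(state, np.array([[reward]], dtype=np.float32))
brain.learn_mu(state, next_state, a)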