Code example #1
class Generator(tf.keras.Model):
    def __init__(self, random_noise_size=100):
        super().__init__(name='generator')
        #layers
        init = RandomNormal(stddev=0.2)
        self.dense_1 = Dense(7 * 7 * 256,
                             use_bias=False,
                             input_shape=(random_noise_size, ))
        self.batchNorm1 = BatchNormalization()
        self.leaky_1 = LeakyReLU(alpha=0.2)
        self.reshape_1 = Reshape((7, 7, 256))

        self.up_2 = UpSampling2D((1, 1), interpolation='nearest')
        self.conv2 = Conv2D(128, (3, 3),
                            strides=(1, 1),
                            padding="same",
                            use_bias=False,
                            kernel_initializer=init)
        self.batchNorm2 = BatchNormalization()
        self.leaky_2 = LeakyReLU(alpha=0.2)

        self.up_3 = UpSampling2D((2, 2), interpolation='nearest')
        self.conv3 = Conv2D(64, (3, 3),
                            strides=(1, 1),
                            padding="same",
                            use_bias=False,
                            kernel_initializer=init)
        self.batchNorm3 = BatchNormalization()
        self.leaky_3 = LeakyReLU(alpha=0.2)

        self.up_4 = UpSampling2D((2, 2), interpolation='nearest')
        self.conv4 = Conv2D(1, (3, 3),
                            activation='tanh',
                            strides=(1, 1),
                            padding="same",
                            use_bias=False,
                            kernel_initializer=init)

        self.optimizer = RMSprop(learning_rate=0.00005)

    def call(self, input_tensor):
        ## Definition of Forward Pass
        x = self.reshape_1(
            self.leaky_1(self.batchNorm1(self.dense_1(input_tensor))))
        x = self.leaky_2(self.batchNorm2(self.conv2(self.up_2(x))))
        x = self.leaky_3(self.batchNorm3(self.conv3(self.up_3(x))))
        return self.conv4(self.up_4(x))

    def generate_noise(self, batch_size, random_noise_size):
        return tf.random.normal([batch_size, random_noise_size])

    def compute_loss(self, y_true, y_pred, class_wanted, class_y):
        """ Wasserstein loss - prob of classfier get it right
        """
        #return tf.math.subtract(backend.mean(y_true * y_pred),categorical_crossentropy(class_wanted,class_y))
        return backend.mean(y_true * y_pred) - categorical_crossentropy(
            class_wanted, class_y)

    def backPropagate(self, gradients, trainable_variables):
        self.optimizer.apply_gradients(zip(gradients, trainable_variables))
Code example #2
class Critic(tf.keras.Model):
    def __init__(self):
        super().__init__(name="critic")

        init = RandomNormal(stddev=0.2)
        #Layers
        self.conv_1 = Conv2D(64, (4, 4),
                             strides=(2, 2),
                             padding='same',
                             kernel_initializer=init,
                             input_shape=[28, 28, 1])
        self.leaky_1 = LeakyReLU(alpha=0.2)
        self.dropout_1 = Dropout(0.3)

        self.conv_2 = Conv2D(128, (4, 4),
                             strides=(2, 2),
                             padding='same',
                             kernel_initializer=init)
        self.leaky_2 = LeakyReLU(alpha=0.2)
        self.dropout_2 = Dropout(0.3)

        self.flat = Flatten()
        self.logits = Dense(
            1)  # This neuron tells us if the input is fake or real

        self.optimizer = RMSprop(learning_rate=0.00005)

    def call(self, input_tensor):
        ## Definition of Forward Pass
        x = self.dropout_1(self.leaky_1(self.conv_1(input_tensor)))
        x = self.dropout_2(self.leaky_2(self.conv_2(x)))
        x = self.flat(x)
        return self.logits(x)

    def compute_loss(self, y_true, y_pred, grad_p):
        """ Wasserstein loss
        """
        lambda_ = 10.0
        return backend.mean(y_true * y_pred) + (lambda_ * grad_p)

    def backPropagate(self, gradients, trainable_variables):
        self.optimizer.apply_gradients(zip(gradients, trainable_variables))
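
The Generator and Critic above are trained adversarially. Below is a minimal, hypothetical training-step sketch (not part of the original snippets): it assumes 28x28x1 images scaled to [-1, 1] and the same Keras imports as above, and it uses plain Wasserstein losses directly instead of the compute_loss helpers, whose extra arguments (classifier outputs, gradient penalty) are not produced here.

import tensorflow as tf

generator = Generator(random_noise_size=100)
critic = Critic()

def wgan_train_step(real_images, batch_size=64, noise_size=100):
    # real_images: a (batch_size, 28, 28, 1) float32 batch, e.g. MNIST scaled to [-1, 1]
    noise = generator.generate_noise(batch_size, noise_size)
    with tf.GradientTape() as g_tape, tf.GradientTape() as c_tape:
        fake_images = generator(noise)          # (batch_size, 28, 28, 1)
        real_score = critic(real_images)
        fake_score = critic(fake_images)
        # Critic minimizes E[fake] - E[real]; generator minimizes -E[fake]
        c_loss = tf.reduce_mean(fake_score) - tf.reduce_mean(real_score)
        g_loss = -tf.reduce_mean(fake_score)
    critic.backPropagate(
        c_tape.gradient(c_loss, critic.trainable_variables),
        critic.trainable_variables)
    generator.backPropagate(
        g_tape.gradient(g_loss, generator.trainable_variables),
        generator.trainable_variables)
    return g_loss, c_loss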
Code example #3
File: tf_sample.py  Project: take610/jax_sample
class Network:
    def __init__(self, input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.optimizer = RMSprop(0.01)
        # self.optimizer = SGD(0.001, momentum=0.9)
        self.model = self._build_model()

    def loss(self, inputs, targets):
        y = self.model(inputs)
        cross_entropy_loss = -tf.reduce_sum(targets * tf.math.log(y + 1e-6),
                                            axis=1)
        loss = tf.reduce_mean(cross_entropy_loss)
        return loss

    def accuracy(self, inputs, targets):
        y = self.model(inputs)
        acc = tf.cast(
            tf.equal(tf.argmax(targets, axis=1), tf.argmax(y, axis=1)),
            tf.float32)
        acc = tf.reduce_sum(acc)
        p = acc / y.shape[0]
        return p

    def train(self, inputs, targets):
        with tf.GradientTape() as tape:
            loss = self.loss(inputs, targets)
        grads = tape.gradient(loss, self.model.trainable_variables)
        grads_and_vars = zip(grads, self.model.trainable_variables)
        self.optimizer.apply_gradients(grads_and_vars)
        return loss

    def _build_model(self):
        input_x = Input(shape=(self.input_shape, ))
        x = Dense(64, activation=relu)(input_x)
        x = Dense(64, activation=relu)(x)
        output = Dense(self.output_shape, activation=softmax)(x)
        model = Model(inputs=input_x, outputs=[output])
        return model
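
A hypothetical usage sketch for the Network class above (not from tf_sample.py): it runs a few training steps on random one-hot data, assuming the file's own imports (Input, Dense, Model, relu, softmax, RMSprop) are in scope.

import numpy as np
import tensorflow as tf

net = Network(input_shape=784, output_shape=10)
x = np.random.rand(32, 784).astype(np.float32)            # fake inputs
t = tf.one_hot(np.random.randint(0, 10, size=32), depth=10)  # fake one-hot targets

for _ in range(10):
    loss = net.train(x, t)    # one RMSprop gradient step
print(float(loss), float(net.accuracy(x, t)))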
Code example #4
class MainAgent:
    def __init__(self, name='agent', reward_weights=None):
        self.reward = 0
        self.episode = 0
        self.name = name

        # Default reward weights
        self.reward_weights = {
            'enemy_killed_value': 1,
            'friendly_killed_value': 1,
            'killed_value': 1,
            'damage_taken': 1,
            'damage_given': 1,
            'damage': 1,
            'outcome': 1,
        }
        if reward_weights:
            self.reward_weights.update(reward_weights)

        self.last_obs = None
        self.recorder = []

        self.model = self.build_model(SCREEN_SIZE, SCREEN_SIZE, SCREEN_DEPTH,
                                      UNIT_TENSOR_LENGTH, len(ACTION_OPTIONS))
        self.opt = RMSprop(learning_rate=LEARNING_RATE)

        # How to convert Blizzard unit and building IDs to our subset of units
        def convert_unit_ids(x):
            if x in UNIT_OPTIONS:
                return (UNIT_OPTIONS.index(x) + 1.) / len(UNIT_OPTIONS)
            return 0.

        self.convert_unit_ids = convert_unit_ids
        self.convert_unit_ids_vect = np.vectorize(convert_unit_ids)

        # How to convert 'player_relative' data
        def convert_player_ids(x):
            if x == 1:  # Self
                return 1.
            elif x == 4:  # Enemy
                return -1.
            else:  # Background usually
                return 0.

        self.convert_player_ids = convert_player_ids
        self.convert_player_ids_vect = np.vectorize(convert_player_ids)

    def reset(self):
        self.recorder = [Episode()]
        self.episode = 0

    def next_episode(self):
        self.episode += 1
        self.recorder.append(Episode())
        self.last_obs = None

    # Train model with recorded game data
    def train(self):
        loss = np.array([0., 0., 0., 0.])
        for ep in self.recorder:
            loss += self._train(
                ep.screen_input[:ep.current_step],
                ep.action_input[:ep.current_step],
                ep.unit_input[:ep.current_step],
                get_discounted_rewards(ep.rewards[:ep.current_step],
                                       discount_rate=DISCOUNT_RATE),
                ep.nonspatial_action[:ep.current_step],
                ep.spatial_action[:ep.current_step],
                ep.screen_used[:ep.current_step])
        return loss / len(self.recorder)

    def _train(self, screens_input, action_input, select_input, reward, action,
               screen_action, screen_used):
        _entropy = _policy_loss = _value_loss = 0.

        with tf.GradientTape() as tape:
            spatial_policy, ns_policy, value = self.model(
                [screens_input, action_input, select_input])
            value = K.squeeze(value, axis=1)

            ns_action_one_hot = K.one_hot(action, len(ACTION_OPTIONS))
            screen_action_one_hot = K.one_hot(screen_action,
                                              SCREEN_SIZE * SCREEN_SIZE)

            value_loss = .5 * K.square(reward - value)

            entropy = -K.sum(ns_policy * K.log(ns_policy + 1e-10), axis=1) - \
                       K.sum(spatial_policy * K.log(spatial_policy + 1e-10), axis=1)
            ns_log_prob = K.log(
                K.sum(ns_policy * ns_action_one_hot, axis=1) + 1e-10)
            spatial_log_prob = K.log(
                K.sum(spatial_policy * screen_action_one_hot, axis=1) + 1e-10)
            advantage = reward - K.stop_gradient(value)

            # Mask out spatial_log_prob when the action taken did not use the screen
            policy_loss = -(ns_log_prob + spatial_log_prob *
                            screen_used) * advantage - entropy * ENTROPY_RATE

            total_loss = policy_loss + value_loss

            _entropy = K.mean(entropy)
            _policy_loss = K.mean(K.abs(policy_loss))
            _value_loss = K.mean(value_loss)

        gradients = tape.gradient(total_loss, self.model.trainable_variables)
        global_norm = tf.linalg.global_norm(gradients)
        print(global_norm)
        gradients, _ = tf.clip_by_global_norm(
            gradients,
            GRADIENT_CLIP_MAX)  # Prevents exploding gradients...I think
        self.opt.apply_gradients(zip(gradients,
                                     self.model.trainable_variables))

        return [
            float(_value_loss),
            float(_policy_loss),
            float(_entropy), global_norm
        ]

    def strip_reshape(self, arr):
        return np.reshape(arr, tuple(s for s in arr.shape if s > 1))

    # Call with game end step and the outcome from the environment
    def step_end(self, obs, outcome):
        last_reward = self.calc_reward(obs, self.last_obs, outcome=outcome)
        self.recorder[self.episode].reward_last_step(last_reward)

    # Takes a state and returns an action, also updates step information
    def step(self, obs, training=True):
        episode = self.recorder[self.episode]

        if self.last_obs:
            last_reward = self.calc_reward(obs, self.last_obs)
            episode.reward_last_step(last_reward)

        screens_input, action_input, select_input = self.build_inputs_from_obs(
            obs)
        spatial_action_policy, ns_action_policy, value = self.model(
            [screens_input, action_input, select_input])

        # Remove dimensions with length 1
        spatial_action_policy = self.strip_reshape(spatial_action_policy)
        ns_action_policy = self.strip_reshape(ns_action_policy)

        if training:
            try:
                screen_choice = np.random.choice(SCREEN_SIZE * SCREEN_SIZE,
                                                 p=spatial_action_policy /
                                                 np.sum(spatial_action_policy))
            except Exception as e:
                print('Error in %s' % self.name)
                raise
        else:
            screen_choice = np.argmax(spatial_action_policy)

        screen_x = screen_choice // SCREEN_SIZE
        screen_y = screen_choice % SCREEN_SIZE

        if training:
            # Select from probability distribution
            choice = np.random.choice(len(ns_action_policy),
                                      p=ns_action_policy)
        else:
            # Select highest probability
            choice = int(np.argmax(ns_action_policy))

        action = ACTION_OPTIONS[choice]
        build_args = []
        # Build action
        for arg in action['args']:
            if arg == 'screen':
                build_args.append([screen_x, screen_y])
            elif arg == 'screen_rect':
                build_args.append([
                    np.max([(screen_x - SELECT_SIZE), 0]),
                    np.max([(screen_y - SELECT_SIZE), 0])
                ])
                build_args.append([
                    np.min([(screen_x + SELECT_SIZE), SCREEN_SIZE - 1]),
                    np.min([(screen_y + SELECT_SIZE), SCREEN_SIZE - 1])
                ])
            elif type(arg) is int:
                build_args.append([arg])
            else:
                raise KeyError('Unrecognized function argument: %s' % arg)

        self.recorder[self.episode].save_step(
            (screens_input, action_input, select_input),
            (spatial_action_policy, ns_action_policy, value), choice,
            screen_choice,
            ('screen' in action['args'] or 'screen_rect' in action['args']))

        self.last_obs = obs
        return actions.FunctionCall(action['id'], build_args)

    def build_inputs_from_obs(self, obs):
        screens_input = np.zeros((SCREEN_DEPTH, SCREEN_SIZE, SCREEN_SIZE),
                                 dtype=np.float32)
        # Transpose feature screens because spatial observations are (y,x) coordinates, everything else is (x,y)
        for ndx, name in enumerate(INPUT_SCREENS):
            if name == 'player_relative':
                screens_input[ndx] = self.convert_player_ids_vect(
                    np.array(obs.observation['feature_screen'][name]))
            elif name == 'unit_type':
                unit_types = np.array(obs.observation['feature_screen'][name])
                screens_input[ndx] = self.convert_unit_ids_vect(unit_types)
            elif name == 'unit_hit_points':
                screens_input[ndx] = np.array(
                    obs.observation['feature_screen'][name]) / UNIT_HP_SCALE
            else:
                screens_input[ndx] = np.array(
                    obs.observation['feature_screen'][name]) / getattr(
                        features.SCREEN_FEATURES, name).scale

        screens_input = np.reshape(screens_input,
                                   (1, SCREEN_SIZE, SCREEN_SIZE, SCREEN_DEPTH))

        # Available actions as array of 1 and 0
        action_input = np.array(
            [(0.
              if act_info['id'] not in obs.observation['available_actions'] or
              (act_info['id'] == actions.FUNCTIONS.select_unit.id
               and act_info['args'][1] >= len(obs.observation['multi_select']))
              else 1.) for act_info in ACTION_OPTIONS],
            dtype=np.float32)
        action_input = np.reshape(action_input, (1, len(ACTION_OPTIONS)))

        # Normalizes the unit select tensor and removes fields
        def convert_select_tensor(x):
            return np.array([
                self.convert_unit_ids(x[0]),
                self.convert_player_ids(x[1]), x[2] / UNIT_HP_SCALE
            ],
                            dtype=np.float32)

        # Selected units
        select_input = np.zeros((MAX_UNIT_SELECT, UNIT_TENSOR_LENGTH),
                                dtype=np.float32)
        for ndx, unit in enumerate(obs.observation['multi_select']):
            select_input[ndx] = convert_select_tensor(unit)
        select_input = np.reshape(select_input,
                                  (1, MAX_UNIT_SELECT * UNIT_TENSOR_LENGTH))

        return screens_input, action_input, select_input

    def calc_reward(self, obs, obs_prev, outcome=0.):
        rw = self.reward_weights

        score = obs.observation['score_by_category']
        score_prev = obs_prev.observation['score_by_category']
        # Difference in army killed minerals and vespene cost minus diff in lost minerals and vespene since last state
        enemy_killed_value = (score[1][1] -
                              score_prev[1][1]) + VESPENE_SCALING * (
                                  score[2][1] - score_prev[2][1])
        friendly_killed_value = (score[3][1] -
                                 score_prev[3][1]) + VESPENE_SCALING * (
                                     score[4][1] - score_prev[4][1])
        diff_value = rw['enemy_killed_value'] * enemy_killed_value - rw[
            'friendly_killed_value'] * friendly_killed_value

        score = obs.observation['score_by_vital']
        score_prev = obs_prev.observation['score_by_vital']
        # Difference in damage dealt minus damage taken since last state
        damage_given = score[0][0] - score_prev[0][0]
        damage_taken = score[1][0] - score_prev[1][0]

        diff_damage = rw['damage_given'] * damage_given - rw[
            'damage_taken'] * damage_taken

        reward = .005 * rw['killed_value'] * diff_value + .01 * rw[
            'damage'] * diff_damage + rw['outcome'] * outcome * .5
        return reward

    def build_model(self,
                    screen_width,
                    screen_height,
                    screen_depth,
                    select_input_length,
                    action_size,
                    training=True):
        K.set_floatx('float32')

        # Inputs
        screen_input = Input(shape=(screen_width, screen_height, screen_depth),
                             dtype='float32')
        action_input = Input(shape=(action_size, ), dtype='float32')
        select_input = Input(shape=(MAX_UNIT_SELECT * select_input_length, ),
                             dtype='float32')

        screen_part = TimeDistributed(
            Conv2D(screen_depth, 5, strides=1, padding='same'))(screen_input)
        screen_part = TimeDistributed(BatchNormalization())(screen_part)
        screen_part = TimeDistributed(Activation('relu'))(screen_part)
        screen_part = TimeDistributed(
            Conv2D(screen_depth, 3, strides=1, padding='same'))(screen_part)
        screen_part = TimeDistributed(BatchNormalization())(screen_part)
        screen_part = TimeDistributed(Activation('relu'))(screen_part)

        action_1 = TimeDistributed(
            Dense(screen_width * screen_height,
                  use_bias=True,
                  activation='relu',
                  name='ingrid'))(action_input)
        action_1 = TimeDistributed(Reshape(
            (screen_width, screen_height, 1)))(action_1)

        select_1 = TimeDistributed(
            Dense(screen_width * screen_height,
                  use_bias=True,
                  activation='relu',
                  name='steve'))(select_input)
        select_1 = TimeDistributed(Reshape(
            (screen_width, screen_height, 1)))(select_1)

        core = TimeDistributed(
            Concatenate(axis=3))([screen_part, action_1, select_1])
        core = ConvLSTM2D(1,
                          5,
                          strides=1,
                          padding='same',
                          activation='relu',
                          training=training)(core)
        # core = Conv2D(10, 5, strides=1, padding='same')(core)
        # core = BatchNormalization()(core)
        # core = Activation('relu')(core)
        # core = Conv2D(4, 5, strides=1, padding='same')(core)
        # core = BatchNormalization()(core)
        # core = Activation('relu')(core)

        action_policy = TimeDistributed(
            Conv2D(1, 3, strides=2, padding='same', activation='relu'))(core)
        action_policy = TimeDistributed(Flatten())(action_policy)
        action_policy = TimeDistributed(
            Dense(action_size * 2, use_bias=True,
                  activation='relu'))(action_policy)
        if training:
            action_policy = TimeDistributed(
                Dropout(DROPOUT_RATE))(action_policy)
        action_policy = TimeDistributed(
            Dense(action_size * 2, use_bias=True,
                  activation='relu'))(action_policy)
        if training:
            action_policy = TimeDistributed(
                Dropout(DROPOUT_RATE))(action_policy)
        action_policy = TimeDistributed(Dense(action_size))(action_policy)
        # Mask out unavailable actions and softmax
        action_policy = K.exp(action_policy) * action_input / (K.sum(
            K.exp(action_policy) * action_input))

        value = TimeDistributed(Conv2D(1, 5, strides=3,
                                       activation='relu'))(core)
        value = TimeDistributed(Flatten())(value)
        if training:
            value = TimeDistributed(Dropout(DROPOUT_RATE))(value)
        value = TimeDistributed(Dense(50, use_bias=True,
                                      activation='relu'))(value)
        value = TimeDistributed(Dense(1))(value)

        # Concat in the action policy to inform the screen policy
        action_policy_dense = TimeDistributed(
            Dense(screen_width * screen_height,
                  use_bias=True,
                  activation='relu'))(K.stop_gradient(action_policy))
        action_policy_dense = TimeDistributed(
            Reshape((screen_width, screen_height, 1)))(action_policy_dense)
        screen_core = TimeDistributed(
            Concatenate(axis=3))([core, action_policy_dense])
        screen_policy = TimeDistributed(Conv2D(5, 3,
                                               padding='same'))(screen_core)
        screen_policy = TimeDistributed(BatchNormalization())(screen_policy)
        screen_policy = TimeDistributed(Activation('relu'))(screen_policy)
        screen_policy = TimeDistributed(Conv2D(1, 3,
                                               padding='same'))(screen_policy)
        screen_policy = TimeDistributed(Flatten())(screen_policy)
        screen_policy = TimeDistributed(Activation('softmax'))(screen_policy)

        model = Model([screen_input, action_input, select_input],
                      [screen_policy, action_policy, value])

        return model
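
The train() method above calls a get_discounted_rewards helper whose implementation is not shown here. A plausible stand-in, given as an assumption rather than the project's actual code, is the standard backward accumulation of discounted returns:

import numpy as np

def get_discounted_rewards(rewards, discount_rate=0.99):
    # Hypothetical sketch: G_t = r_t + discount_rate * G_{t+1}, computed back to front
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount_rate * running
        returns[t] = running
    return returns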
Code example #5
class Critic():
    def __init__(self,
                 ALPHA,
                 lambda_=0,
                 Gamma=0.99,
                 n_actions=4,
                 layer1_size=16,
                 layer2_size=16,
                 input_dims=8):

        self.gamma = Gamma
        # self.lr = ALPHA
        self.lambda_ = lambda_
        self.input_dims = input_dims
        self.h1_dims = layer1_size
        self.h2_dims = layer2_size
        self.n_actions = n_actions

        self.critic = self.build_policy_network()

        self.optimizer = RMSprop(learning_rate=ALPHA)

        self.actions_space = [i for i in range(n_actions)]

    def build_policy_network(self):
        #Build the Network
        input = Input(shape=(self.input_dims, ))

        # no hidden layer
        if (self.h1_dims == 0 and self.h2_dims == 0):
            Q_values = Dense(self.n_actions, activation='linear')(input)
        #One hidden layer
        elif (self.h1_dims != 0 and self.h2_dims == 0):
            dense1 = Dense(self.h1_dims,
                           activation='relu',
                           kernel_regularizer=l2(0.01))(input)
            Q_values = Dense(self.n_actions, activation='linear')(dense1)
        #Two hidden layers
        else:
            dense1 = Dense(self.h1_dims,
                           activation='relu',
                           kernel_regularizer=l2(0.01))(input)
            dense2 = Dense(self.h2_dims,
                           activation='relu',
                           kernel_regularizer=l2(0.01))(dense1)
            Q_values = Dense(self.n_actions, activation='linear')(dense2)

        critic = Model(inputs=[input], outputs=[Q_values])
        critic.summary()

        # Eligibility traces are initialized to zero
        # tvs = critic.trainable_variables
        # self.eligibilty = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in tvs]

        return critic

    def initialize_eligibility(self, observation, action):

        state = observation[np.newaxis, :]
        #Get gradient of Q function
        with tf.GradientTape() as tape:
            Qvalues = self.critic(state)
            tvs = self.critic.trainable_variables
            Q = Qvalues[0, action]
        # Gradient of Q for the current state and action with respect to the network weights (bias included)
        grads = tape.gradient(Q, tvs)

        self.eligibilty = grads

    def learn(self, reward, next_state, next_action, Q, done):
        # weights = self.critic.get_weights()

        #Get gradient of Q function
        with tf.GradientTape() as tape:
            Qvalues = self.critic(next_state)
            tvs = self.critic.trainable_variables
            next_Q = Qvalues[0, next_action]
        # Gradient of Q for the next state and action with respect to the network weights (bias included)
        grads = tape.gradient(next_Q, tvs)

        Q_ = np.array(next_Q)
        # print(Q,Q_)
        # When done is True the next state's value is zeroed, so the target for the current action is just the reward
        TD_error = reward + self.gamma * Q_ * (1 - int(done)) - Q

        #Update weights
        # weights = weights + self.lr * TD_error * self.eligibilty
        # Scale each eligibility trace by the TD error
        td_el = [TD_error * el for el in self.eligibilty]

        # Normalize each trace-weighted update to unit norm (skip all-zero terms)
        td_el = [el / np.linalg.norm(el) if np.linalg.norm(el) != 0.0 else el
                 for el in td_el]

        self.optimizer.apply_gradients(
            zip(td_el, self.critic.trainable_variables))

        #Update Eligibility Traces
        # self.eligibilty = [self.gamma * self.lambda_ * elg for elg in self.eligibilty]
        self.eligibilty = [
            (self.gamma * self.lambda_ * self.eligibilty[i]) + grad
            for i, grad in enumerate(grads)
        ]

        # print(TD_error)

    def save_model(self, name):
        self.critic.save(name)

    def load_model(self, name):
        self.critic = load_model(name)
Code example #6
class PPO:
    def __init__(self,
                 action_dim,
                 k,
                 clip_norm=None,
                 optim="adam",
                 write_weights=False,
                 gamma=0.9,
                 eps=0.2,
                 actor_lr=0.0001,
                 critic_lr=0.0002,
                 actor_update_steps=10,
                 critic_update_steps=10):
        self.action_dim = action_dim
        self.k = k
        self.gamma = gamma
        self.eps = eps
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.actor_update_steps = actor_update_steps
        self.critic_update_steps = critic_update_steps
        self.actor = Actor(action_dim, k)
        self.critic = Critic()
        if optim == "adam":
            self.actor_optim = Adam(actor_lr, clipnorm=clip_norm) \
                                if clip_norm is not None else Adam(actor_lr)
            self.critic_optim = Adam(critic_lr, clipnorm=clip_norm) \
                                if clip_norm is not None else Adam(critic_lr)
        elif optim == "rms":
            self.actor_optim = RMSprop(actor_lr, clipnorm=clip_norm) \
                if clip_norm is not None else RMSprop(actor_lr)
            self.critic_optim = RMSprop(critic_lr, clipnorm=clip_norm) \
                if clip_norm is not None else RMSprop(critic_lr)
        self.actor_old = Actor(action_dim, k)
        self.write_weights = write_weights

    def choose_action(self, state):
        # currently the state should be (1, s_dim = |V| * 4)
        action_probs = self.actor.predict(state)  # (1, a_dim = |E|)
        dist = tfd.Categorical(probs=action_probs)
        action = self.sample_without_replacement(action_probs,
                                                 self.k)  # (1, k)
        action = tf.squeeze(action)
        return action.numpy().tolist(), tf.clip_by_value(tf.squeeze(
            tf.math.reduce_prod(dist.prob(action)) / \
            (self.actor.normalizer(action_probs, self.k)).numpy() +
                                  self.actor.eps), 1e-16, 1)
        # (k,), ()

    def sample_without_replacement(self, p, k):
        z = -tf.math.log(-tf.math.log(tf.random.uniform(tf.shape(p), 0, 1)))
        z = tf.cast(z, tf.double)
        pr = tf.cast(p, tf.double)
        _, indices = tf.math.top_k(tf.math.log(pr) + z, k)
        return indices

    def get_v(self, state):
        v_tensor = self.critic(state)  # (1, 1)
        return v_tensor.numpy()[0, 0]

    def update(self, memory: Memory, discounted_rewards, writer, name, step):
        self.actor_old.set_weights(self.actor.get_weights())
        # pi_old_a, pi_old = self.actor_old.evaluate_probs(memory.states,
        #                                           memory.actions)
        pi_old_a = np.array(memory.probs)
        memory_state_values = self.critic(np.array(memory.states))
        memory_state_values = tf.squeeze(memory_state_values)  # (N, )
        discounted_rewards_arr = np.array(discounted_rewards)
        advantages = discounted_rewards_arr - memory_state_values

        for au in range(self.actor_update_steps):
            with tf.GradientTape() as ag:
                pi_theta_a = self.actor.evaluate_probs(memory.states,
                                                       memory.actions)
                # pi_old = tf.stop_gradient(tf.convert_to_tensor(memory.probs))
                ratios = pi_theta_a / pi_old_a  # (N,)
                surr1 = ratios * advantages
                surr2 = tf.clip_by_value(ratios, 1 - self.eps,
                                         1 + self.eps) * advantages
                actor_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))
            gradient = ag.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_optim.apply_gradients(
                zip(gradient, self.actor.trainable_variables))
            #kl_div = tf.keras.losses.KLDivergence()(pi_old, pi_theta)
            #with writer.as_default():
            #    tf.summary.scalar("kl div", kl_div.numpy(), step=au)

        for cu in range(self.critic_update_steps):
            with tf.GradientTape() as cg:
                memory_state_values = self.critic(np.array(memory.states))
                memory_state_values = tf.squeeze(memory_state_values)
                advantages = discounted_rewards_arr - memory_state_values
                critic_loss = tf.reduce_mean(tf.square(advantages))

            gradient = cg.gradient(critic_loss,
                                   self.critic.trainable_variables)
            self.critic_optim.apply_gradients(
                zip(gradient, self.critic.trainable_variables))

    def init_ac(self, state):
        self.actor_old.predict(state)
        self.actor.predict(state)
        self.critic.predict(state)

    def load_ac(self, a_weights, c_weights):
        self.actor_old.load_weights(a_weights)
        self.actor.load_weights(a_weights)
        self.critic.load_weights(c_weights)
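
choose_action above relies on sample_without_replacement, which is the Gumbel-top-k trick: add Gumbel noise to the log-probabilities and keep the k largest entries, yielding k distinct indices. A standalone toy check (not from the original code, and independent of the Actor/Critic/Memory classes this snippet depends on):

import tensorflow as tf

p = tf.constant([[0.1, 0.4, 0.2, 0.3]])                                  # toy probabilities
z = -tf.math.log(-tf.math.log(tf.random.uniform(tf.shape(p), 0, 1)))     # Gumbel noise
_, idx = tf.math.top_k(tf.math.log(tf.cast(p, tf.double)) + tf.cast(z, tf.double), k=2)
print(idx.numpy())   # two distinct column indices, e.g. [[1 3]]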
Code example #7
class CGAN:
    """Generate y conditioned on x."""
    def __init__(self,
                 x_features,
                 y_features,
                 latent_dim=32,
                 g_hidden=32,
                 d_hidden=32,
                 label_smooth=0.9,
                 d_dropout=0.1,
                 gp_weight=1,
                 ds_weight=1):
        self.x_features = x_features
        self.y_features = y_features

        self.latent_dim = latent_dim
        self.g_hidden = g_hidden
        self.d_hidden = d_hidden
        self.label_smooth = label_smooth
        self.d_dropout = d_dropout
        self.gp_weight = gp_weight
        self.ds_weight = ds_weight

        self.g_optimizer = Adam(0.0001)
        self.d_optimizer = RMSprop(0.0001)
        self.generator = self.build_generator()
        self.discriminator = self.build_discriminator()

    def build_generator(self):
        """Generator model consists of a dense layer after each component."""
        noise = Input(shape=(self.latent_dim, ))  # noise
        d_noise = Dense(self.g_hidden)(noise)
        x = Input(shape=(self.x_features, ))  # condition
        d_x = Dense(self.g_hidden)(x)
        z = Concatenate()([d_noise, d_x])
        d_z = Dense(self.g_hidden)(z)
        y = Dense(self.y_features)(d_z)
        return Model([noise, x], y)

    def build_discriminator(self):
        """Discriminator model consists of a dense layer after each component."""
        x = Input(shape=(self.x_features, ))  # condition
        d_x = Dense(self.d_hidden)(x)
        y = Input(shape=(self.y_features, ))  # y
        d_y = Dense(self.d_hidden)(y)
        h = Concatenate()([d_x, d_y])
        h = Dense(self.d_hidden)(h)
        h = Dropout(self.d_dropout)(h)
        p = Dense(1)(h)
        return Model([y, x], p)

    def g_loss(self, fake_pred):
        return -tf.math.reduce_mean(fake_pred)

    def d_loss(self, real_pred, fake_pred):
        return -tf.math.reduce_mean(
            real_pred * self.label_smooth) + tf.math.reduce_mean(fake_pred)

    def loss(self, X, y):
        noise = tf.random.normal((X.shape[0], self.latent_dim))
        fake_y = self.generator([noise, X])
        fake_pred = self.discriminator([fake_y, X])
        return self.g_loss(fake_pred)

    def gradient_penalty(self, real_y, fake_y, X):
        """Gradient penalty on discriminator"""
        batch_size = real_y.shape[0]
        epsilon = tf.random.normal([batch_size, self.y_features], 0.0, 1.0)
        interpolate_y = epsilon * real_y + (1 - epsilon) * fake_y

        with tf.GradientTape() as gp_tape:
            gp_tape.watch(interpolate_y)
            pred = self.discriminator([interpolate_y, X], training=True)

        # Per-sample gradient of the critic output w.r.t. the interpolated samples
        gradients = gp_tape.gradient(pred, [interpolate_y])[0]
        norm = tf.sqrt(tf.reduce_sum(tf.square(gradients), axis=1))
        gp = tf.reduce_mean((norm - 1.0)**2)
        return gp * self.gp_weight

    def diversity_score(self, X):
        batch_size = X.shape[0]
        z1 = tf.random.normal([batch_size, self.latent_dim])
        z2 = tf.random.normal([batch_size, self.latent_dim])
        y1 = self.generator([z1, X], training=True)
        y2 = self.generator([z2, X], training=True)
        denom = tf.reduce_mean(tf.abs(z1 - z2), axis=1)
        numer = tf.reduce_mean(tf.abs(y1 - y2), axis=1)
        ds = tf.reduce_mean(numer / denom)
        return tf.math.minimum(
            ds, 0.1) * self.ds_weight  # cap the diversity term for numerical stability

    @tf.function
    def train_step(self, X, real_y):
        noise = tf.random.normal((X.shape[0], self.latent_dim))

        with tf.GradientTape() as g_tape, tf.GradientTape() as d_tape:
            fake_y = self.generator([noise, X], training=True)

            real_pred = self.discriminator([real_y, X], training=True)
            fake_pred = self.discriminator([fake_y, X], training=True)

            gp = self.gradient_penalty(real_y, fake_y, X)
            ds = self.diversity_score(X)

            g_loss = self.g_loss(fake_pred) - ds
            d_loss = self.d_loss(real_pred, fake_pred) + gp

        g_gradients = g_tape.gradient(g_loss,
                                      self.generator.trainable_variables)
        d_gradients = d_tape.gradient(d_loss,
                                      self.discriminator.trainable_variables)
        self.g_optimizer.apply_gradients(
            zip(g_gradients, self.generator.trainable_variables))
        self.d_optimizer.apply_gradients(
            zip(d_gradients, self.discriminator.trainable_variables))

        return g_loss, d_loss

    def fit(self, X, y, epochs=1000, verbose=1, plot=False, logdir='cgan'):
        # Tensorboard
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        train_log_dir = 'logs/' + logdir + '/' + current_time
        train_summary_writer = tf.summary.create_file_writer(train_log_dir)

        for epoch in range(epochs):
            g_loss, d_loss = self.train_step(X, y)
            with train_summary_writer.as_default():
                tf.summary.scalar('Generator Loss', g_loss, step=epoch)
                tf.summary.scalar('Discriminator Loss', d_loss, step=epoch)
            if verbose and epoch % (epochs // 10) == 0:
                print(f"{epoch} [D loss: {d_loss}] [G loss: {g_loss}]")

    def predict(self, X):
        noise = tf.random.normal((X.shape[0], self.latent_dim))
        return self.generator([noise, X]).numpy()
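
A hypothetical end-to-end usage sketch for the CGAN class (not part of the original code): fit on small synthetic data and draw conditional samples, assuming the module's own imports (Keras layers, Adam, RMSprop, tf, datetime) are available.

import numpy as np

X = np.random.rand(256, 5).astype(np.float32)   # conditioning features
y = np.random.rand(256, 2).astype(np.float32)   # targets to model

cgan = CGAN(x_features=5, y_features=2, latent_dim=16)
cgan.fit(X, y, epochs=200, logdir='cgan_demo')
samples = cgan.predict(X[:10])                  # (10, 2) y values conditioned on X[:10]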
Code example #8
class Trainer:
    def __init__(self,
                 util: Utils,
                 hr_size=96,
                 log_dir: str = None,
                 num_resblock: int = 16):
        self.vgg = self.vgg(20)
        self.learning_rate = 0.00005
        self.clipping = 0.01
        self.generator_optimizer = RMSprop(learning_rate=self.learning_rate,
                                           clipvalue=self.clipping)
        self.discriminator_optimizer = RMSprop(
            learning_rate=self.learning_rate, clipvalue=self.clipping)
        self.binary_cross_entropy = BinaryCrossentropy(from_logits=True)
        self.mean_squared_error = MeanSquaredError()
        self.util: Utils = util
        self.HR_SIZE = hr_size
        self.LR_SIZE = self.HR_SIZE // 4

        if log_dir is not None:
            self.summary_writer = tf.summary.create_file_writer(log_dir)
            if log_dir.startswith('../'):
                log_dir = log_dir[len('../'):]
            print('open tensorboard with: tensorboard --logdir ' + log_dir)

        else:
            self.summary_writer = None

        self.generator = make_generator_model(num_res_blocks=num_resblock)
        self.discriminator = make_discriminator_model(self.HR_SIZE)
        self.checkpoint = tf.train.Checkpoint(generator=self.generator,
                                              discriminator=self.discriminator)

    def summary(self):
        print('Discriminator:')
        print(self.discriminator.summary())
        print('Generator: \n')
        print(self.generator.summary())

    def vgg(self, output_layer):
        vgg = VGG19(input_shape=(None, None, 3), include_top=False)
        return Model(vgg.input, vgg.layers[output_layer].output)

    def train_generator(self,
                        train_dataset,
                        valid_dataset,
                        epochs=20000,
                        valid_lr=None,
                        valid_hr=None):
        evaluate_size = epochs / 10

        loss_mean = Mean()

        start_time = time.time()
        epoch = 0

        for lr, hr in train_dataset.take(epochs):
            epoch += 1
            step = tf.convert_to_tensor(epoch, dtype=tf.int64)
            generator_loss = self.train_generator_step(lr, hr)
            loss_mean(generator_loss)

            if epoch % 50 == 0:
                loss_value = loss_mean.result()
                loss_mean.reset_states()

                psnr_value = self.evaluate(valid_dataset.take(1))

                print(
                    f'Time for epoch {epoch}/{epochs} is {(time.time() - start_time):.4f} sec, '
                    f'gan loss = {loss_value:.4f}, psnr = {psnr_value:.4f}')
                start_time = time.time()

                if self.summary_writer is not None:
                    with self.summary_writer.as_default():
                        tf.summary.scalar('generator_loss',
                                          loss_value,
                                          step=epoch)
                        tf.summary.scalar('psnr', psnr_value, step=epoch)

            if epoch % evaluate_size == 0:
                self.util.save_checkpoint(self.checkpoint, epoch)

            if epoch % 5000 == 0:
                self.generate_and_save_images(step, valid_lr, valid_hr)

    def train_gan(self,
                  train_dataset,
                  valid_dataset,
                  epochs=200000,
                  valid_lr=None,
                  valid_hr=None):
        evaluate_size = epochs / 10
        start = time.time()
        vgg_metric = Mean()
        dls_metric = Mean()
        g_metric = Mean()
        c_metric = Mean()
        epoch = 0

        for lr, hr in train_dataset.take(epochs):
            epoch += 1
            step = tf.convert_to_tensor(epoch, tf.int64)
            vgg_loss, discremenator_loss, generator_loss, content_loss = self.train_gan_step(
                lr, hr)
            vgg_metric(vgg_loss)
            dls_metric(discremenator_loss)
            g_metric(generator_loss)
            c_metric(content_loss)

            if epoch % 50 == 0:
                vgg = vgg_metric.result()
                discriminator_loss_metric = dls_metric.result()
                generator_loss_metric = g_metric.result()
                content_loss_metric = c_metric.result()

                vgg_metric.reset_states()
                dls_metric.reset_states()
                g_metric.reset_states()
                c_metric.reset_states()

                psnr_value = self.evaluate(valid_dataset.take(1))

                print(
                    f'Time for epoch {epoch}/{epochs} is {(time.time() - start):.4f} sec, '
                    f' perceptual loss = {vgg:.4f},'
                    f' generator loss = {generator_loss_metric:.4f},'
                    f' discriminator loss = {discriminator_loss_metric:.4f},'
                    f' content loss = {content_loss_metric:.4f},'
                    f' psnr = {psnr_value:.4f}')

                start = time.time()

                if self.summary_writer is not None:
                    with self.summary_writer.as_default():
                        tf.summary.scalar('generator_loss',
                                          generator_loss_metric,
                                          step=epoch)
                        tf.summary.scalar('content loss',
                                          content_loss_metric,
                                          step=epoch)
                        tf.summary.scalar(
                            'vgg loss = content loss + 0.001 * gan loss',
                            vgg,
                            step=epoch)
                        tf.summary.scalar('discriminator_loss',
                                          discriminator_loss_metric,
                                          step=epoch)
                        tf.summary.scalar('psnr', psnr_value, step=epoch)

            if epoch % evaluate_size == 0:
                self.util.save_checkpoint(self.checkpoint, epoch)

            if epoch % 5000 == 0:
                self.generate_and_save_images(step, valid_lr, valid_hr)

    @tf.function
    def train_generator_step(self, lr, hr):
        with tf.GradientTape() as tape:
            lr = tf.cast(lr, tf.float32)
            hr = tf.cast(hr, tf.float32)

            fake_image = self.generator(lr, training=True)
            loss_value = self.mean_squared_error(hr, fake_image)

        gradients = tape.gradient(loss_value,
                                  self.generator.trainable_variables)
        self.generator_optimizer.apply_gradients(
            zip(gradients, self.generator.trainable_variables))

        return loss_value

    @tf.function
    def train_gan_step(self, lr, hr):
        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            lr = tf.cast(lr, tf.float32)
            hr = tf.cast(hr, tf.float32)

            fake_image = self.generator(lr, training=True)

            real_classification = self.discriminator(hr, training=True)
            fake_classification = self.discriminator(fake_image, training=True)

            content_loss = self.content_loss(hr, fake_image)
            generator_loss = self.generator_loss(fake_image)
            # lpips_loss = self.lpips_loss(hr, fake_image)
            vgg_loss = content_loss + 0.001 * generator_loss
            # print('lpips: ' + str(lpips_loss))
            # loss = generator_loss + 100 * lpips_loss
            discremenator_loss = self.discriminator_loss(
                real_classification, fake_classification)

            gradients_of_generator = gen_tape.gradient(
                vgg_loss, self.generator.trainable_variables)
            gradients_of_discriminator = disc_tape.gradient(
                discremenator_loss, self.discriminator.trainable_variables)

            self.generator_optimizer.apply_gradients(
                zip(gradients_of_generator,
                    self.generator.trainable_variables))
            self.discriminator_optimizer.apply_gradients(
                zip(gradients_of_discriminator,
                    self.discriminator.trainable_variables))

        return vgg_loss, discremenator_loss, generator_loss, content_loss

    # Loss functions:

    def lpips_loss(self, hr, fake_image):
        nhr = hr.numpy()
        nfi = fake_image.numpy()
        print(nhr.shape)
        print(nfi.shape)
        return self.loss_fn_vgg(nhr, nfi)

    @tf.function
    def content_loss(self, hr, fake_image):
        fake_image = preprocess_input(fake_image)
        hr = preprocess_input(hr)
        fake_features = self.vgg(fake_image) / 12.75
        hr_features = self.vgg(hr) / 12.75
        return self.mean_squared_error(hr_features, fake_features)

    @tf.function
    def discriminator_loss(self, real_class, fake_class):
        # hr_loss = self.binary_cross_entropy(tf.ones_like(real_class), real_class)
        # fake_loss = self.binary_cross_entropy(tf.zeros_like(fake_class), fake_class)
        # return hr_loss + fake_loss
        return tf.reduce_mean(fake_class) - tf.reduce_mean(real_class)

    @tf.function
    def generator_loss(self, fake_class):
        gan_loss = -tf.reduce_mean(fake_class)
        # gan_loss = self.binary_cross_entropy(tf.ones_like(fake_class), fake_class)
        return gan_loss

    # Helper
    def save_model(self, appendix=''):
        self.util.save_model(self.generator, 'generator' + appendix)
        self.util.save_model(self.discriminator, 'discriminator' + appendix)

    def generate_and_save_images(self, step, lr, hr):
        epoch = tf.cast(step, tf.int64)
        plt.close('all')
        generated = self.util.resolve_single(self.generator, lr)

        plt.figure(figsize=(15, 30), clear=True)
        figures = [lr, generated, hr]
        titles = ['LR', 'Generated', 'HR']
        for i in range(3):
            plt.subplot(3, 1, 1 + i)
            plt.title(titles[i])
            plt.imshow(figures[i] / 255)
            plt.axis('off')
            plt.xticks([])
            plt.yticks([])

        fig = plt.gcf()
        self.util.save_figure(fig, epoch)

    def evaluate(self, dataset):
        psnr_values = []
        for lr, hr in dataset:
            sr = self.util.resolve(self.generator, lr)
            psnr_value = self.psnr(hr, sr)[0]
            psnr_values.append(psnr_value)
        return tf.reduce_mean(psnr_values)

    def psnr(self, x1, x2):
        return tf.image.psnr(x1, x2, max_val=255)

    def load_generator(self, file):
        self.generator.load_weights(file)

    def load_discriminator(self, file):
        self.discriminator.load_weights(file)

    def load_checkpoint(self, file):
        self.checkpoint.restore(
            tf.train.latest_checkpoint(file)).assert_consumed()
Code example #9
File: agent.py  Project: ziqian2000/AtariGamer
class Agent:
    def __init__(self, env_id, debug, use_DDQN, mem_limit, render):
        # hyper-parameters
        self.discount_factor = 0.99
        self.minibatch_size = 32
        self.update_frequency = 4
        self.target_network_update_frequency = 10000 if not use_DDQN else 30000
        self.history_len = 4
        self.memory_size = int(
            0.9 * (float(mem_limit) * (1024**3) /
                   ((84 * 84 * 4 * 2 + 4 + 4 + 1) +
                    (2 * 4)) if mem_limit else 130000)) if not debug else 5000
        self.init_explr = 1.0
        self.final_explr = 0.1
        self.final_explr_frame = 1000000
        self.terminal_explr = 0.01
        self.terminal_explr_frame = self.final_explr_frame * 10
        self.replay_start_size = 50000 if not debug else 3000
        self.training_frames = int(1e7)
        self.learning_rate = 0.00025
        self.momentum = 0.95
        self.frame_skip = 4

        # frames limit
        self.fps = 60
        self.max_playing_time = 5  # minutes
        self.total_frames_limit = self.fps * 60 * self.max_playing_time

        # environment
        self.env_id = env_id
        self.env = AtariEnvironment(self.env_id, self.total_frames_limit)

        # other parameters
        self.action_num = self.env.get_action_num()
        self.latest_record_num = 100
        self.print_info_interval = 20 if not debug else 1
        self.save_weight_interval = 200 if not debug else 3
        self.highest_score = -1e9
        self.use_DDQN = use_DDQN
        self.render = True if render else False

        # network
        self.memory = PrioritizedReplayMemory(
            minibatch_size=self.minibatch_size,
            memory_size=self.memory_size,
            history_len=self.history_len)
        self.main_network = Network(action_num=self.action_num,
                                    history_len=self.history_len)
        self.target_network = Network(action_num=self.action_num,
                                      history_len=self.history_len)
        # self.optimizer = Adam(lr=self.learning_rate, epsilon=1e-6)
        self.optimizer = RMSprop(learning_rate=self.learning_rate,
                                 momentum=self.momentum,
                                 epsilon=1e-2)
        self.loss = tf.keras.losses.Huber()
        self.loss_metric = tf.keras.metrics.Mean()
        self.q_metric = tf.keras.metrics.Mean()

        # other tools (log, summary)
        self.log_path = ("drive/My Drive/AtariGamer/" if not debug else
                         "./") + "log/" + datetime.now().strftime(
                             "%Y%m%d_%H%M%S") + "_" + self.env_id

        print("- DDQN:", ("YES" if self.use_DDQN else "NO"))

    @tf.function
    def get_action(self, state, exploration_rate):
        """
        get action by ε-greedy algorithm
        :param state: current state
        :param exploration_rate: current exploration rate
        :return: action, an integer
        """
        if tf.random.uniform(
            (), minval=0, maxval=1, dtype=tf.float32
        ) < exploration_rate:  # explore: randomly choose action
            action = tf.random.uniform((),
                                       minval=0,
                                       maxval=self.action_num,
                                       dtype=tf.int32)
        else:
            q_value = self.main_network(
                tf.cast(tf.expand_dims(state, axis=0), tf.float32))
            action = tf.cast(tf.squeeze(tf.argmax(q_value, axis=1)),
                             dtype=tf.int32)
        return action

    @tf.function
    def get_explr(self, frames):
        """
        get exploration rate using linear annealing
        :param frames: the number of frames passed
        :return: exploration rate, a float
        """
        if frames < self.replay_start_size:
            explr = self.init_explr
        elif frames < self.final_explr_frame:
            explr = self.init_explr + (self.final_explr - self.init_explr) / (
                self.final_explr_frame -
                self.replay_start_size) * (frames - self.replay_start_size)
        elif frames < self.terminal_explr_frame:
            explr = self.final_explr + (
                self.terminal_explr - self.final_explr) / (
                    self.terminal_explr_frame -
                    self.final_explr_frame) * (frames - self.final_explr_frame)
        else:
            explr = self.terminal_explr
        return explr

    @tf.function
    def update_main_network_natural(self, state_batch, action_batch,
                                    reward_batch, next_state_batch,
                                    terminated_batch, weight_batch):
        """
        update main Q network by experience replay
        :param weight_batch: importance sampling weight
        :param state_batch: batch of states
        :param action_batch: batch of actions
        :param reward_batch: batch of rewards
        :param next_state_batch: batch of next states
        :param terminated_batch: batch of whether it is terminated
        :return: absolute TD errors, used to update replay priorities
        """
        with tf.GradientTape() as tape:
            next_state_q = self.target_network(next_state_batch)
            next_state_max_q = tf.reduce_max(next_state_q, axis=1)
            expected_q = reward_batch + self.discount_factor * next_state_max_q * (
                1.0 - tf.cast(terminated_batch, tf.float32))

            main_q = tf.reduce_sum(self.main_network(state_batch) * tf.one_hot(
                action_batch, self.action_num, on_value=1.0, off_value=0.0),
                                   axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q,
                             weight_batch)
        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(
            zip(clipped_gradients, self.main_network.trainable_variables))

        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)
        return tf.abs(main_q - expected_q)

    @tf.function
    def update_main_network_DDQN(self, state_batch, action_batch, reward_batch,
                                 next_state_batch, terminated_batch,
                                 weight_batch):
        """
        update main Q network by experience replay
        :param weight_batch: importance sampling weight
        :param state_batch: batch of states
        :param action_batch: batch of actions
        :param reward_batch: batch of rewards
        :param next_state_batch: batch of next states
        :param terminated_batch: batch of whether it is terminated
        :return: absolute TD errors, used to update replay priorities
        """
        with tf.GradientTape() as tape:

            main_next_state_q_list = self.main_network(next_state_batch)
            target_next_state_q_list = self.target_network(next_state_batch)

            max_action = tf.argmax(main_next_state_q_list, axis=-1)
            next_state_q = tf.reduce_sum(target_next_state_q_list * tf.one_hot(
                max_action, self.action_num, on_value=1.0, off_value=0.0),
                                         axis=1)
            expected_q = reward_batch + self.discount_factor * next_state_q * (
                1.0 - tf.cast(terminated_batch, tf.float32))

            main_q_list = self.main_network(state_batch)
            main_q = tf.reduce_sum(main_q_list * tf.one_hot(
                action_batch, self.action_num, on_value=1.0, off_value=0.0),
                                   axis=1)

            loss = self.loss(tf.stop_gradient(expected_q), main_q,
                             weight_batch)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(
            zip(clipped_gradients, self.main_network.trainable_variables))

        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)
        return tf.abs(main_q - expected_q)

    @tf.function
    def update_target_network(self):
        """
        synchronize weights of target network with main network
        """
        main_weights = self.main_network.trainable_variables
        target_weights = self.target_network.trainable_variables
        for main_v, target_v in zip(main_weights, target_weights):
            target_v.assign(main_v)

    def train(self, load_path=None):

        if load_path:
            loaded_checkpoints = tf.train.latest_checkpoint(load_path)
            self.main_network.load_weights(loaded_checkpoints)
            self.target_network.load_weights(loaded_checkpoints)

            self.init_explr = self.terminal_explr
            self.final_explr = self.terminal_explr
            self.memory.beta = self.memory.beta0

        frames = 0
        episodes = 0
        latest_scores = deque(maxlen=self.latest_record_num)

        while frames < self.training_frames:

            cur_state = self.env.reset()
            episode_reward = 0
            terminated = False
            last_action = 0

            while not terminated:

                if frames % self.frame_skip == 0:
                    action = last_action
                else:
                    explr = self.get_explr(
                        tf.constant(frames, dtype=tf.float32))
                    action = self.get_action(
                        tf.constant(cur_state, dtype=tf.uint8),
                        tf.constant(explr, dtype=tf.float32))
                    last_action = action

                next_state, reward, terminated, _ = self.env.step(action)
                episode_reward += reward

                self.memory.push(cur_state, action, reward, next_state,
                                 terminated)
                cur_state = next_state

                if frames > self.replay_start_size:
                    if frames % self.update_frequency == 0:
                        (state_batch, action_batch, reward_batch, next_state_batch, terminated_batch), \
                            ptr_batch, imp_samp_weight_batch = self.memory.sample()

                        update_func = self.update_main_network_DDQN if self.use_DDQN else self.update_main_network_natural
                        abs_error_batch = update_func(
                            state_batch, action_batch, reward_batch,
                            next_state_batch, terminated_batch,
                            tf.expand_dims(imp_samp_weight_batch, -1))

                        # refresh the priorities of the sampled transitions
                        # with their absolute TD errors
                        self.memory.update(ptr_batch, abs_error_batch)

                    if frames % self.target_network_update_frequency == 0:
                        self.update_target_network()

                frames += 1

                if terminated:
                    latest_scores.append(episode_reward)
                    episodes += 1

                    if episodes % self.print_info_interval == 0:
                        print(
                            "[" + datetime.now().strftime("%m.%d %H:%M:%S") +
                            "] Episode: {}\t Latest {} average score: {:.2f}\t Progress: {} / {} ( {:.2f} % )"
                            .format(episodes, self.latest_record_num,
                                    np.mean(latest_scores), frames,
                                    self.training_frames, frames /
                                    self.training_frames * 100))

                    if episodes % self.save_weight_interval == 0:
                        average_score = self.play(None, 10)
                        if average_score > self.highest_score:
                            self.highest_score = average_score
                            print("Weights saving...", end="")
                            self.main_network.save_weights(
                                self.log_path +
                                "/score_{}".format(average_score))
                            print("Done!")

    def play(self, load_path, trials):

        if load_path is not None:
            loaded_checkpoints = tf.train.latest_checkpoint(load_path)
            self.main_network.load_weights(loaded_checkpoints)

        env = AtariEnvironment(self.env_id,
                               self.total_frames_limit,
                               clip_rewards=False,
                               episode_life=False)
        reward_list = []
        frame_list = []

        for t in range(trials):

            cur_state = env.reset()
            frames = []
            episode_reward = 0
            terminated = False

            while not terminated:

                if self.render:
                    frames.append(env.render())

                action = self.get_action(
                    tf.constant(cur_state, dtype=tf.uint8),
                    tf.constant(0.0, dtype=tf.float32))

                next_state, reward, terminated, _ = env.step(action)
                episode_reward += reward

                cur_state = next_state

            reward_list.append(episode_reward)
            frame_list.append(frames)

        print("Scores on {} trials: ".format(trials), reward_list)
        print("Highest score: ", np.max(reward_list))
        print("Average score: ", np.mean(reward_list))
        best_idx = int(np.argmax(reward_list))
        if self.render:
            imageio.mimsave(self.env_id + ".gif",
                            frame_list[best_idx],
                            fps=self.fps)
        return np.mean(reward_list)
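
The update functions above pass an importance-sampling weight_batch into self.loss(...), but the loss itself is not shown in this excerpt. The snippet below is a minimal sketch of what such a weighted Huber loss could look like; the name weighted_huber_loss and the delta value are assumptions, not part of the original project.

import tensorflow as tf


def weighted_huber_loss(target_q, pred_q, is_weights, delta=1.0):
    """Per-sample Huber loss scaled by prioritized-replay IS weights."""
    abs_error = tf.abs(target_q - pred_q)
    quadratic = tf.minimum(abs_error, delta)
    linear = abs_error - quadratic
    per_sample = 0.5 * tf.square(quadratic) + delta * linear
    # The IS weights correct the bias introduced by sampling high-priority
    # transitions more often than uniform replay would.
    return tf.reduce_mean(tf.squeeze(is_weights) * per_sample)
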
Code Example #10
File: agent.py Project: dvanhise/penguin-ai
class PenguinAgent:
    def __init__(self):
        self.model = self.build_model()
        self.opt = RMSprop(lr=LEARNING_RATE)
        self.recorder = [Episode()]

    def build_input(self, state):
        map_input = np.zeros((5, 11, 8), dtype=np.float32)
        map_input[0] = state['fish']
        map_input[1] = state['penguins']
        map_input[2] = np.full((11, 8), state['score'][0])
        map_input[3] = np.full((11, 8), state['score'][1])
        map_input[4] = np.full((11, 8), np.float32(state['phase']))

        map_input = np.moveaxis(map_input, 0, -1)  # channels first (5, 11, 8) -> channels last (11, 8, 5)
        map_input = np.reshape(map_input, (1, 11, 8, 5))
        return map_input

    def step(self, state, player, training=True):
        map_input = self.build_input(state)
        policy, value = self.model([map_input])
        policy = np.squeeze(policy)

        target = None
        destination = None
        mask = np.zeros((11, 8, 2), dtype=np.float32)
        if state['phase'] == 0:
            choices = np.zeros(len(state['placements']))
            for ndx, tile in enumerate(state['placements']):
                mask[tile[0]][tile[1]][0] = 1.0
                choices[ndx] = policy[tile[0]][tile[1]][0]

            choices = K.exp(choices) / (K.sum(K.exp(choices)))
            if training:
                target_ndx = np.random.choice(len(state['placements']), p=choices)
            else:
                target_ndx = np.argmax(choices)
            target = state['placements'][target_ndx]
            destination = None
        elif state['phase'] == 1:
            # TODO: MCTS

            choices = np.zeros(len(state['moves'].keys()))
            options = list(state['moves'].keys())
            for ndx, tile in enumerate(options):
                mask[tile[0]][tile[1]][0] = 1.0
                choices[ndx] = policy[tile[0]][tile[1]][0]

            choices = K.exp(choices) / (K.sum(K.exp(choices)))
            if training:
                target_ndx = np.random.choice(len(options), p=choices)
            else:
                target_ndx = np.argmax(choices)
            target = options[target_ndx]

            choices = np.zeros(len(state['moves'][target]))
            for ndx, tile in enumerate(state['moves'][target]):
                mask[tile[0]][tile[1]][1] = 1.0
                choices[ndx] = policy[tile[0]][tile[1]][1]

            choices = K.exp(choices) / (K.sum(K.exp(choices)))
            if training:
                destination_ndx = np.random.choice(len(state['moves'][target]), p=choices)
            else:
                destination_ndx = np.argmax(choices)
            destination = state['moves'][target][destination_ndx]

        self.recorder[-1].save_step(map_input, value, policy, mask, player, target, destination)

        return target, destination

    def step_end(self, rewards):
        self.recorder[-1].set_rewards(0, rewards[0])
        self.recorder[-1].set_rewards(1, rewards[1])
        self.recorder.append(Episode())

    def train(self):
        loss = np.array([0., 0., 0.])
        loss += self._train(
            np.concatenate([ep.map_input[:ep.current_step] for ep in self.recorder]),
            np.concatenate([ep.reward[:ep.current_step] for ep in self.recorder]),
            np.concatenate([ep.policy_mask[:ep.current_step] for ep in self.recorder]),
            np.concatenate([ep.policy_one_hot[:ep.current_step] for ep in self.recorder])
        )
        self.recorder = [Episode()]  # Clear recorder after training
        return loss

    def _train(self, map_input, reward, policy_mask, policy_one_hot):
        _entropy = _policy_loss = _value_loss = 0.

        policy_mask = policy_mask.astype('float32')
        with tf.GradientTape() as tape:
            policy, value = self.model(map_input)
            value = K.squeeze(value, axis=1)
            policy = K.exp(policy) / (K.sum(K.exp(policy)))

            value_loss = .5 * K.square(reward - value)
            # Should I use policy * policy_mask here?
            entropy = -K.sum(policy * K.log(policy + 1e-10), axis=[1, 2, 3])

            log_prob = K.log(K.sum(policy * policy_one_hot, axis=[1, 2, 3]) + 1e-10)
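            # Advantage: recorded reward minus the predicted value; stop_gradient
            # keeps the critic out of the policy-gradient term.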
            advantage = reward - K.stop_gradient(value)

            policy_loss = -log_prob * advantage - entropy * ENTROPY_RATE

            total_loss = policy_loss + value_loss

            _entropy = K.mean(entropy)
            _policy_loss = K.mean(K.abs(policy_loss))
            _value_loss = K.mean(value_loss)

        gradients = tape.gradient(total_loss, self.model.trainable_variables)
        gradients, _ = tf.clip_by_global_norm(gradients, GRADIENT_CLIP_MAX)
        self.opt.apply_gradients(zip(gradients, self.model.trainable_variables))

        return [float(_value_loss), float(_policy_loss), float(_entropy)]

    def build_model(self):
        K.set_floatx('float32')
        map_input = Input(shape=(11, 8, 5), dtype='float32')

        core = Conv2D(10, 3, strides=1, padding='same', input_shape=(11, 8, 5))(map_input)
        core = BatchNormalization()(core)
        core = Activation('relu')(core)
        core = Conv2D(10, 3, strides=1, padding='same')(core)
        core = BatchNormalization()(core)
        core = Activation('relu')(core)
        core = Conv2D(10, 3, strides=1, padding='same')(core)
        core = BatchNormalization()(core)
        core = Activation('relu')(core)

        policy = Conv2D(4, 3, strides=1, padding='same', activation='relu')(core)
        policy = Conv2D(2, 3, strides=1, padding='same')(policy)  # 11 x 8 x 2

        value = Flatten()(core)
        value = Dense(20, use_bias=True)(value)
        value = Dense(1)(value)

        model = Model([map_input], [policy, value])
        return model

    def strip_reshape(self, arr):
        return np.reshape(arr, tuple(s for s in arr.shape if s > 1))
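
step above repeats the same selection pattern three times: gather the policy logits of the legal tiles, softmax them, then sample an index during training or take the argmax at evaluation. A helper along the following lines could factor that out; it is only an illustrative sketch, and the name sample_from_logits is invented here rather than taken from the project.

import numpy as np


def sample_from_logits(logits, training=True, rng=np.random):
    """Softmax over the logits of the legal choices and pick one index."""
    logits = np.asarray(logits, dtype=np.float64)
    exp = np.exp(logits - logits.max())  # subtract the max for numerical stability
    probs = exp / exp.sum()
    if training:
        return int(rng.choice(len(probs), p=probs))  # stochastic during self-play
    return int(np.argmax(probs))
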
Code Example #11
class RL_Brain():
    def __init__(self,
                 n_features,
                 n_action,
                 memory_size=10,
                 batch_size=32,
                 gamma=0.9,
                 fi_size=8):
        self.n_features = n_features
        self.n_actions = n_action
        self.memory_size = memory_size
        self.replay_buffer = np.zeros((self.memory_size, n_features * 2 + 2),
                                      np.float64)
        self.count = 0
        self.batch_size = batch_size
        self.gamma = gamma

        self.opt = RMSprop()

        # The input state vector is very low-dimensional (only 2 dimensions), so
        # there is no need for an auto-encoder to encode/decode the state; the
        # raw state values are fed in directly.
        self.input_states = Input((self.n_features, ), name='input_states')

        self.branch_1_model = keras.Sequential(
            [Input((2, )), Dense(1, use_bias=False, name='R')])

        self.branch_2_model = [
            keras.Sequential([
                Input((2, )),
                Dense(5, 'relu', name='mu/m%s/layer1' % i),
                Dense(5, 'relu', name='mu/m%s/layer2' % i),
                Dense(2, name='mu/m%s/layer3' % i)
            ],
                             name='branch_%s' % i)
            for i in range(self.n_actions)
        ]
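        # branch_1_model: a single linear layer ('R') mapping the 2-d state
        # features to a scalar reward estimate; its weights act as the reward
        # weights w used below.
        # branch_2_model: one small MLP per action predicting the 2-d
        # successor features (mu) of taking that action.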

    def learn_w(self, state, r):
        with tf.GradientTape() as tape:
            pred = self.branch_1_model(state)
            loss = mean_squared_error(r, pred)
        grads = tape.gradient(loss, self.branch_1_model.trainable_variables)
        self.opt.apply_gradients(
            zip(grads, self.branch_1_model.trainable_variables))

    def learn_mu(self, state, state_, action_index):
        w = self.branch_1_model.get_layer('R').get_weights()[0]
        mus_ = []
        for i in range(self.n_actions):
            mus_.append(self.branch_2_model[i](state_))
        mus_ = np.squeeze(mus_)
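        # Greedy next action: pick the branch whose predicted successor
        # features give the largest estimated reward w . mu(s').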
        max_index = np.argmax(np.squeeze(np.matmul(mus_, w)), axis=0)
        with tf.GradientTape() as tape:
            pred = self.branch_2_model[action_index](state)
            label = state + self.gamma * mus_[max_index]
            loss = mean_squared_error(label, pred)
        grads = tape.gradient(
            loss, self.branch_2_model[action_index].trainable_variables)
        self.opt.apply_gradients(
            zip(grads, self.branch_2_model[action_index].trainable_variables))

    def choose_action(self, state, is_random=False):
        if is_random:
            return np.random.choice(self.n_actions)
        w = self.branch_1_model.get_layer('R').get_weights()[0]
        mus = []
        for i in range(self.n_actions):
            pred = self.branch_2_model[i](state)
            mus.append(pred)
        mus = np.squeeze(mus)
        rs = np.squeeze(np.matmul(mus, w))
        if len(set(rs)) == 1:
            action_index = np.random.choice(self.n_actions)
        else:
            action_index = np.argmax(rs)
        return action_index

    def append_to_replay_buffer(self, s, a, r, s_):
        transition = np.hstack([s, a, r, s_])
        self.replay_buffer[self.count % self.memory_size] = transition
        self.count += 1