Code example #1
            row.set_cell(column_family_id='trajectory',
                         column='actions'.encode(),
                         value=actions)
            row.set_cell(column_family_id='trajectory',
                         column='rewards'.encode(),
                         value=rewards)
            rows.append(row)

            if args.log_time: time_logger.log("Write Cells      ")

        #UPDATE GLOBAL ITERATOR
        gi_row = cbt_table.row('global_iterator'.encode())
        gi_row.set_cell(column_family_id='global',
                        column='i'.encode(),
                        value=struct.pack('i', row_key_i + 1),
                        timestamp=datetime.datetime.utcnow())

        #ADD TRAJECTORIES AS ROWS TO BIGTABLE
        rows.append(gi_row)
        cbt_batcher.mutate_rows(rows)
        cbt_batcher.flush()
        rows = []

        if args.log_time: time_logger.log("Mutate Rows      ")

        print("-> Saved trajectories {} - {}.".format(
            row_key_i - (args.num_episodes - 1), row_key_i))

        if args.log_time: time_logger.print_totaltime_logs()
    env.close()
    print("-> Done!")
Code example #2
class DQN_Agent:
    """
    Class for controlling and managing training from a Bigtable database.

    Attributes:
        cbt_table (google.cloud.bigtable.Table): Bigtable table object returned from [util.gcp_io.cbt_load_table].
        gcs_bucket (google.cloud.storage.Bucket): GCS bucket object returned from [util.gcp_io.gcs_load_bucket].
        gcs_bucket_id (str): Global name of the GCS bucket where the model will be saved/loaded.
        prefix (str): Prefix used for model and trajectory names.
        tmp_weights_filepath (str): Temporary local path for saving the model before copying to GCS.
        buffer_size (int): Max size of the experience buffer.
        batch_size (int): Batch size used for training.
        num_trajectories (int): Number of trajectory rows to fetch from Bigtable per epoch.
        train_epochs (int): Number of cycles of querying Bigtable and training.
        train_steps (int): Number of train steps per epoch.
        period (int): Interval, in epochs, for saving models.
        output_dir (str): Output directory for logs and models.
        log_time (bool): Flag for time logging.
        num_gpus (int): Number of GPU devices for the distribution strategy.
        tpu_name (str): Name of the TPU to train on, or None to train on GPU/CPU.
        wandb: Weights & Biases run object used for metric logging, or None to disable it.
    """
    def __init__(self, **kwargs):
        """
        The constructor for DQN_Agent class.

        """
        hyperparams = kwargs['hyperparams']
        self.input_shape = hyperparams['input_shape']
        self.num_actions = hyperparams['num_actions']
        self.gamma = hyperparams['gamma']
        self.cbt_table = kwargs['cbt_table']
        self.gcs_bucket = kwargs['gcs_bucket']
        self.gcs_bucket_id = kwargs['gcs_bucket_id']
        self.prefix = kwargs['prefix']
        self.tmp_weights_filepath = kwargs['tmp_weights_filepath']
        self.batch_size = kwargs['batch_size']
        self.num_trajectories = kwargs['num_trajectories']
        self.train_epochs = kwargs['train_epochs']
        self.train_steps = kwargs['train_steps']
        self.period = kwargs['period']
        self.output_dir = kwargs['output_dir']
        self.log_time = kwargs['log_time']
        self.num_gpus = kwargs['num_gpus']
        self.tpu_name = kwargs['tpu_name']
        self.wandb = kwargs['wandb']
        self.exp_buff = ExperienceBuffer(kwargs['buffer_size'])

        if self.tpu_name is not None:
            self.distribution_strategy = get_distribution_strategy(
                distribution_strategy='tpu', tpu_address=self.tpu_name)
            self.device = '/job:worker'
        else:
            self.distribution_strategy = get_distribution_strategy(
                distribution_strategy='default', num_gpus=self.num_gpus)
            self.device = None
        with tf.device(self.device), self.distribution_strategy.scope():
            self.model = DQN_Model(
                input_shape=self.input_shape,
                num_actions=self.num_actions,
                conv_layer_params=hyperparams['conv_layer_params'],
                fc_layer_params=hyperparams['fc_layer_params'],
                learning_rate=hyperparams['learning_rate'])
        gcs_load_weights(self.model, self.gcs_bucket, self.prefix,
                         self.tmp_weights_filepath)

    def fill_experience_buffer(self):
        """
        Method that fills the experience buffer object from CBT.

        Reads a batch of rows and parses through them until experience buffer reaches buffer_size.

        """
        self.exp_buff.reset()

        if self.log_time: self.time_logger.reset()

        #FETCH DATA
        global_i = cbt_global_iterator(self.cbt_table)
        rows = cbt_read_rows(self.cbt_table, self.prefix,
                             self.num_trajectories, global_i)

        if self.log_time: self.time_logger.log("Fetch Data      ")

        for row in tqdm(
                rows, "Parsing trajectories {} - {}".format(
                    global_i - self.num_trajectories, global_i - 1)):
            #DESERIALIZE DATA
            bytes_obs = row.cells['trajectory']['obs'.encode()][0].value
            bytes_actions = row.cells['trajectory'][
                'actions'.encode()][0].value
            bytes_rewards = row.cells['trajectory'][
                'rewards'.encode()][0].value

            if self.log_time: self.time_logger.log("Parse Bytes     ")

            #FORMAT DATA
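            # The dtypes below must mirror the collector's serialization
            # (actions written as uint8, rewards and observations as float32);
            # np.frombuffer reinterprets raw bytes, so a mismatch would
            # silently corrupt the arrays.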
            actions = np.frombuffer(bytes_actions,
                                    dtype=np.uint8).astype(np.int32)
            rewards = np.frombuffer(bytes_rewards, dtype=np.float32)
            num_steps = actions.size
            obs_shape = np.append(num_steps, self.input_shape).astype(np.int32)
            obs = np.frombuffer(bytes_obs, dtype=np.float32).reshape(obs_shape)

            if self.log_time: self.time_logger.log("Format Data     ")

            self.exp_buff.add_trajectory(obs, actions, rewards, num_steps)

            if self.log_time: self.time_logger.log("Add To Exp_Buff ")
        self.exp_buff.preprocess()

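        # Each dataset element pairs (obs, next_obs) with
        # (action, reward, next_mask); train_step unpacks exactly this nesting.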
        dataset = tf.data.Dataset.from_tensor_slices(
            ((self.exp_buff.obs, self.exp_buff.next_obs),
             (self.exp_buff.actions, self.exp_buff.rewards,
              self.exp_buff.next_mask)))
        dataset = dataset.shuffle(self.exp_buff.max_size).repeat().batch(
            self.batch_size)

        dist_dataset = self.distribution_strategy.experimental_distribute_dataset(
            dataset)

        if self.log_time: self.time_logger.log("To Dataset      ")

        return dist_dataset

    def train(self):
        """
        Method that trains a model using using parameters defined in the constructor.

        """
        @tf.function
        def train_step(dist_inputs):
            def step_fn(inputs):
                ((b_obs, b_next_obs), (b_actions, b_rewards,
                                       b_next_mask)) = inputs

                with tf.GradientTape() as tape:
                    q_pred, q_next = self.model(b_obs), self.model(b_next_obs)
                    one_hot_actions = tf.one_hot(b_actions, self.num_actions)
                    q_pred = tf.reduce_sum(q_pred * one_hot_actions, axis=-1)
                    # Mask the bootstrapped value so terminal transitions use
                    # only the immediate reward as their target (b_next_mask
                    # was previously unpacked but never applied).
                    q_next = tf.reduce_max(q_next, axis=-1) * b_next_mask
                    q_target = b_rewards + (
                        tf.constant(self.gamma, dtype=tf.float32) * q_next)
                    mse = self.model.loss(q_target, q_pred)
                    loss = tf.reduce_sum(mse)

                total_grads = tape.gradient(loss, self.model.trainable_weights)
                self.model.opt.apply_gradients(
                    list(zip(total_grads, self.model.trainable_weights)))
                return mse

            per_example_losses = self.distribution_strategy.experimental_run_v2(
                step_fn, args=(dist_inputs, ))
            mean_loss = self.distribution_strategy.reduce(
                tf.distribute.ReduceOp.MEAN, per_example_losses, axis=None)
            return mean_loss

        if self.log_time:
            self.time_logger = TimeLogger([
                "Fetch Data      ", "Parse Bytes     ", "Format Data     ",
                "Add To Exp_Buff ", "To Dataset      ", "Train Step      ",
                "Save Model      "
            ])
        print("-> Starting training...")
        for epoch in range(self.train_epochs):
            with tf.device(self.device), self.distribution_strategy.scope():
                dataset = self.fill_experience_buffer()
                exp_buff = iter(dataset)

                losses = []
                for step in tqdm(range(self.train_steps),
                                 "Training epoch {}".format(epoch)):
                    loss = train_step(next(exp_buff))
                    losses.append(loss)

                    if self.log_time:
                        self.time_logger.log("Train Step      ")

                if self.wandb is not None:
                    mean_loss = np.mean(losses)
                    tf.summary.scalar("Mean Loss", mean_loss, step=epoch)
                    self.wandb.log({"Epoch": epoch, "Mean Loss": mean_loss})

            if epoch > 0 and epoch % self.period == 0:
                model_filename = self.prefix + '_model.h5'
                gcs_save_weights(self.model, self.gcs_bucket,
                                 self.tmp_weights_filepath, model_filename)

            if self.log_time: self.time_logger.log("Save Model      ")

            if self.log_time: self.time_logger.print_totaltime_logs()
        print("-> Done!")