Code Example #1
    def train_step(batch_images, batch_labels):
        with tf.GradientTape() as tape:
            # Forward pass and loss computation, recorded on the tape
            pred = centernet(batch_images, training=True)
            loss_value = post_process.training_procedure(
                batch_labels=batch_labels, pred=pred)
        # Differentiate the loss w.r.t. every trainable variable and
        # apply the gradients to update the model
        gradients = tape.gradient(target=loss_value,
                                  sources=centernet.trainable_variables)
        optimizer.apply_gradients(
            grads_and_vars=zip(gradients, centernet.trainable_variables))
        loss_metric.update_state(values=loss_value)

    # Resume from the epoch after the last loaded checkpoint
    for epoch in range(load_weights_from_epoch + 1, Config.epochs):
        for step, batch_data in enumerate(train_data):
            step_start_time = time.time()
            images, labels = data_loader.read_batch_data(batch_data)
            train_step(images, labels)
            step_end_time = time.time()
            print("Epoch: {}/{}, step: {}/{}, loss: {}, time_cost: {:.3f}s".
                  format(epoch, Config.epochs, step, steps_per_epoch,
                         loss_metric.result(),
                         step_end_time - step_start_time))
        loss_metric.reset_states()

        # Periodically save the weights in TensorFlow checkpoint format
        if epoch % Config.save_frequency == 0:
            centernet.save_weights(filepath=Config.save_model_dir +
                                   "epoch-{}".format(epoch),
                                   save_format="tf")

        if Config.test_images_during_training:
            visualize_training_results(pictures=Config.test_images_dir_list,
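A note on this pattern: in a complete script, a step function like train_step is usually decorated with @tf.function so the tape, forward pass, and parameter update compile into a single graph. A minimal self-contained sketch of the same custom-loop pattern with a toy model (all names below are illustrative, not taken from the project above):

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss_fn = tf.keras.losses.MeanSquaredError()
loss_metric = tf.keras.metrics.Mean(name="loss")

@tf.function  # compile the step into a graph for faster execution
def train_step(x, y):
    with tf.GradientTape() as tape:
        pred = model(x, training=True)
        loss_value = loss_fn(y, pred)
    gradients = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    loss_metric.update_state(loss_value)

train_step(tf.random.normal([8, 4]), tf.random.normal([8, 1]))
print("toy loss:", float(loss_metric.result()))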
Code Example #2
File: train.py  Project: youyouf/Tensorflow2.x
def train():
    # Enable GPU memory growth so TensorFlow does not reserve all GPU memory up front
    gpus = tf.config.experimental.list_physical_devices("GPU")
    if gpus:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    # Load the training data
    dataloader = DetectionDataset()
    train_data, train_size = dataloader.generate_datatset(
        mode="train")  # reads the annotation strings from the txt file; returns batched strings and the dataset size
    train_steps_per_epoch = tf.math.ceil(train_size / Config.batch_size)
    # Validation set
    val_data, val_size = dataloader.generate_datatset(mode="val")
    val_steps_per_epoch = tf.math.ceil(val_size / Config.batch_size)
    data_loader = DataLoader()  # decodes a batch of annotation strings into images and labels

    if os.path.exists(Config.log_dir):
        # Clear any existing summaries from the log directory
        shutil.rmtree(Config.log_dir)

    # Create the directory for saving the model
    if not os.path.exists(os.path.split(Config.save_model_path)[0]):
        os.mkdir(os.path.split(Config.save_model_path)[0])

    print(
        'Total {} samples: train on {} samples, val on {} samples with batch size {}.'
        .format((train_size + val_size), train_size, val_size,
                Config.batch_size))
    # Optimizer with an exponentially decaying learning rate
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=1e-4,
        decay_steps=train_steps_per_epoch * Config.learning_rate_decay_epochs,
        decay_rate=0.96)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
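    # For reference (added note, not in the original script): ExponentialDecay
    # yields lr(step) = initial_learning_rate * decay_rate ** (step / decay_steps),
    # so with decay_steps set as above the learning rate shrinks by 4%
    # every learning_rate_decay_epochs epochs.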

    # Build the model
    centernet = CenterNet()
    print_model_summary(centernet)
    try:
        centernet.load_weights(filepath=Config.save_model_path)
        print("Loaded pre-trained weights.")
    except Exception:
        print("No pre-trained weights found; training from scratch.")

    # Metrics: running means of the train/validation losses
    train_loss = tf.metrics.Mean(name='train_loss')
    valid_loss = tf.metrics.Mean(name='valid_loss')

    post_process = PostProcessing()
    # Track the best validation loss seen so far, for saving the best model
    best_test_loss = float('inf')

    # Create the TensorBoard summary writer
    summary_writer = tf.summary.create_file_writer(logdir=Config.log_dir)
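    # The scalars written through this writer can be inspected with TensorBoard
    # (added note), e.g. by running: tensorboard --logdir <Config.log_dir>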

    # Training loop
    for epoch in range(1, Config.epochs + 1):
        train_loss.reset_states()
        valid_loss.reset_states()

        # Iterate over the training set
        for step, batch_data in enumerate(train_data):
            step_start_time = time.time()
            images, labels = data_loader.read_batch_data(
                batch_data
            )  # returns the images plus label info [batch, max_boxes_per_image, xmin, ymin, xmax, ymax, class_id]
            with tf.GradientTape() as tape:
                # Forward pass
                pred = centernet(images, training=True)
                # Compute the loss
                loss_value = post_process.training_procedure(
                    batch_labels=labels, pred=pred)

            # Backpropagation: differentiate the loss with respect to every
            # trainable variable in the model
            gradients = tape.gradient(target=loss_value,
                                      sources=centernet.trainable_variables)
            # Apply each gradient to its variable; zip pairs the gradients
            # with the corresponding trainable variables
            optimizer.apply_gradients(
                grads_and_vars=zip(gradients, centernet.trainable_variables))

            # Update the running train loss
            train_loss.update_state(values=loss_value)

            step_end_time = time.time()
            print("Epoch: {}/{}, step: {}/{}, loss: {}, time_cost: {:.3f}s".
                  format(epoch, Config.epochs, step, train_steps_per_epoch,
                         train_loss.result(), step_end_time - step_start_time))

            with summary_writer.as_default():
                tf.summary.scalar(
                    "steps_perbatch_train_loss",
                    train_loss.result(),
                    step=tf.cast(((epoch - 1) * train_steps_per_epoch + step),
                                 tf.int64))

        # Evaluate on the validation set
        for step, batch_data in enumerate(val_data):
            step_start_time = time.time()
            images, labels = data_loader.read_batch_data(
                batch_data
            )  # returns the images plus label info [batch, max_boxes_per_image, xmin, ymin, xmax, ymax, class_id]
            # Forward pass in inference mode
            pred = centernet(images, training=False)
            # Compute the loss
            loss_value = post_process.training_procedure(batch_labels=labels,
                                                         pred=pred)

            # Update the running validation loss
            valid_loss.update_state(loss_value)
            step_end_time = time.time()
            print(
                "--------Epoch: {}/{}, step: {}/{}, loss: {}, time_cost: {:.3f}s"
                .format(epoch, Config.epochs, step, val_steps_per_epoch,
                        valid_loss.result(), step_end_time - step_start_time))
            with summary_writer.as_default():
                tf.summary.scalar("steps_perbatch_val_loss",
                                  valid_loss.result(),
                                  step=tf.cast(
                                      (epoch - 1) * val_steps_per_epoch + step,
                                      tf.int64))

        # Log the per-epoch losses to TensorBoard
        with summary_writer.as_default():
            tf.summary.scalar("train_loss",
                              train_loss.result(),
                              step=optimizer.iterations)
            tf.summary.scalar('valid_loss',
                              valid_loss.result(),
                              step=optimizer.iterations)

        # Save only the best model (lowest validation loss so far)
        if valid_loss.result() < best_test_loss:
            best_test_loss = valid_loss.result()
            centernet.save_weights(Config.save_model_path, save_format="tf")
            print("Update model's weights")