# Example 1 (score: 0)
def main():
    """Build the train/val datasets from the CLI config and dump the first
    21 batches' lead image of each split to /tmp for visual inspection."""
    config = argparse.parse_cmd(arguments)

    def _build_split(list_key, split_key):
        # One dataset per split, using the transformation list configured for it.
        return dataloader.get_dataset(
            config['dataloader'],
            transformation_list=config['dataloader'][list_key],
            num_classes=10,
            split=config['dataloader'][split_key])

    datasets = {
        'train': _build_split('train_list', 'train_split_id'),
        'val': _build_split('val_list', 'val_split_id'),
    }

    for dataset_type in ['train', 'val']:
        for i, (images, y) in enumerate(datasets[dataset_type]):
            # OpenCV writes images as BGR while TF yields RGB, so reverse the
            # channel axis before saving.
            bgr_image = images[0].numpy()[:, :, ::-1]
            cv2.imwrite(os.path.join('/tmp', f'{dataset_type}_{i}.png'), bgr_image)
            if i == 20:
                break
    def test_compare_without_augmentations(self):
        """Without augmentations, our TFRecord dataloader must produce the
        same normalized tensor as the DCN dataloader on a constant image."""
        # create fake dataset and convert TFrecords
        dataset_dir = create_fake_dataset_and_convert_to_tfrecords(
            image_array=self.image_white, n_images_per_class=10)
        # Ensure the temporary dataset is removed even if an assertion fails.
        self.addCleanup(shutil.rmtree, dataset_dir)

        config = {
            'name': 'tfrecords',
            'data_dir': dataset_dir,
            'batch_size': BATCH_SIZE,
            'train_split_id': 'train',
        }

        # get data from tfrecord
        dataset = get_dataset(config, [], 1, 'train')

        image, _ = next(iter(dataset))
        # Normalize the image similar to DCN strategy: scale to [0, 1] then
        # subtract the per-pixel mean over the batch.
        normalize = image / 255.
        subtract_mean = tf.reduce_mean(normalize, axis=0)
        x_train_from_our_dataloader = normalize - subtract_mean

        x_train_from_dcn_dataloader = self.get_dcn_dataloader(self.image_white)

        # compare output between our Dataloader and DCN dataloader
        # (assertTrue instead of assertEqual(np.allclose(...), True))
        self.assertTrue(
            np.allclose(x_train_from_our_dataloader,
                        x_train_from_dcn_dataloader))
def evaluate_dataset(args, model):
    """Evaluate *model* on the validation split described by args['dataloader']."""
    print(f"Evaluating on {args['dataloader']['name']}")
    loader_conf = args['dataloader']
    # Training split is irrelevant for evaluation; disable it explicitly.
    loader_conf['train_split_id'] = None
    val_dataset = dataloader.get_dataset(
        loader_conf,
        transformation_list=loader_conf['val_list'],
        num_classes=args["num_classes"],
        split=loader_conf['val_split_id'])
    model.evaluate(val_dataset)
    def test_compare_with_augmentation_translate(self):
        """Translate augmentation: our dataloader and Keras' ImageDataGenerator
        must keep the same tensor shape and (approximately) the same mean."""
        # create fake dataset
        dataset_dir = create_fake_dataset_and_convert_to_tfrecords(
            image_array=self.batch_random_images, n_images_per_class=10)
        # Ensure cleanup runs even when an assertion fails.
        self.addCleanup(shutil.rmtree, dataset_dir)

        config = {
            'name': 'tfrecords',
            'data_dir': dataset_dir,
            'batch_size': BATCH_SIZE,
            'train_split_id': 'train',
            'Translate': {
                'width_shift_range': 0.125,
                'height_shift_range': 0.125,
                'padding_strategy': 'REFLECT'
            }
        }

        dataset = get_dataset(config, ['Translate'], 1, 'train')

        image, _ = next(iter(dataset))
        # Normalize the image similar to DCN strategy
        normalize = image / 255.
        subtract_mean = tf.reduce_mean(normalize, axis=0)
        x_train_from_our_dataloader = normalize - subtract_mean

        # get x_train from dcn
        x_data = self.get_dcn_dataloader(self.batch_random_images)

        # DCN augmentation strategy
        dcn_augmentations = keras_preprocessing.ImageDataGenerator(
            height_shift_range=config['Translate']['height_shift_range'],
            width_shift_range=config['Translate']['width_shift_range'],
            horizontal_flip=False)

        x_train_from_dcn_dataloader = next(
            iter(
                dcn_augmentations.flow(x_data,
                                       None,
                                       batch_size=BATCH_SIZE,
                                       shuffle=False)))

        # Test the tensor shape remains the same.
        # (Was assertTrue(dcn.shape, dcn.shape): second arg is just the failure
        # message, so it compared nothing — now a real shape comparison.)
        self.assertEqual(
            tuple(x_train_from_our_dataloader.shape),
            tuple(x_train_from_dcn_dataloader.shape))

        # Note for translate we expect the mean to be different as the method to fill the missing pixels that keras and our data loader use are different.
        # This doesn't cause an issue with the image being translated (see test_data_visualize), rather difference in pixel values at the shifted places.
        # data visualization does confirm the translation works as expected.
        self.assertAlmostEqual(np.mean(x_train_from_our_dataloader),
                               np.mean(x_train_from_dcn_dataloader),
                               places=3)
def train(args):
    """Run a Bayesian hyper-parameter search over the model space defined by
    get_model, validating each trial on the validation split."""
    print(args)
    global_conf.config_tf2(args)
    checkpoint_dir, log_dir, export_dir = create_env_directories(
        args, get_experiment_name(args))

    loader_conf = args['dataloader']
    train_dataset = dataloader.get_dataset(
        loader_conf,
        transformation_list=loader_conf['train_list'],
        num_classes=args["num_classes"],
        split=loader_conf['train_split_id'])
    val_dataset = dataloader.get_dataset(
        loader_conf,
        transformation_list=loader_conf['val_list'],
        num_classes=args["num_classes"],
        split=loader_conf['val_split_id'])

    setup_mp(args)
    build_model_fn = get_model(args)
    callbacks = get_callbacks(args, log_dir)

    # Effectively unbounded number of trials; 10 random points seed the
    # Gaussian-process model before the Bayesian acquisition takes over.
    tuner = BayesianOptimization(build_model_fn,
                                 objective='val_accuracy',
                                 max_trials=100000,
                                 num_initial_points=10,
                                 directory=checkpoint_dir)

    tuner.search_space_summary()
    tuner.search(x=train_dataset,
                 validation_data=val_dataset,
                 callbacks=callbacks,
                 epochs=args['num_epochs'])
    tuner.results_summary()
# Example 6 (score: 0)
def main():
    """Dump the first 21 images of the 'train' and 'test' splits to /tmp
    for visual inspection."""
    args = argparse.parse_cmd(arguments)
    datasets = {
        'train':
        dataloader.get_dataset(
            args['dataloader'],
            transformation_list=args['dataloader']['train_list'],
            num_classes=10,
            split='train'),
        'val':
        dataloader.get_dataset(
            args['dataloader'],
            transformation_list=args['dataloader']['val_list'],
            num_classes=10,
            split='test')
    }
    for dataset_type in ['train', 'val']:
        for i, (images, y) in enumerate(datasets[dataset_type]):
            image = images[0]
            # OpenCV writes BGR while TF yields RGB: reverse the channel axis,
            # otherwise the saved colors are swapped (matches the sibling
            # visualization script's handling).
            cv2.imwrite(os.path.join('/tmp', f'{dataset_type}_{i}.png'),
                        image.numpy()[:, :, ::-1])
            if i == 20:
                break
# Example 7 (score: 0)
def train(args):
    """End-to-end training: build datasets, model and callbacks, fit with
    checkpoint resume, then export the trained model."""
    print(args)
    global_conf.config_tf2(args)
    checkpoint_dir, log_dir, export_dir = create_env_directories(
        args, get_experiment_name(args))

    loader_conf = args['dataloader']
    train_dataset = dataloader.get_dataset(
        loader_conf,
        transformation_list=loader_conf['train_list'],
        num_classes=args["num_classes"],
        split=loader_conf['train_split_id'])
    val_dataset = dataloader.get_dataset(
        loader_conf,
        transformation_list=loader_conf['val_list'],
        num_classes=args["num_classes"],
        split=loader_conf['val_split_id'])

    setup_mp(args)
    model, _ = define_model_in_strategy(args, get_model)
    alchemy_api.send_model_info(model, args['server'])
    callbacks = get_callbacks(args, log_dir)

    # Checkpoint callback also tells us which epoch to resume from.
    model_checkpoint_cb, latest_epoch = init_custom_checkpoint_callbacks(
        {'model': model}, checkpoint_dir)
    callbacks.append(model_checkpoint_cb)
    # Only stream metrics to the Alchemy server when an experiment id is set.
    if args['server']['id'] != '':
        callbacks.append(alchemy_api.send_metric_callbacks(args['server']))

    model.fit(x=train_dataset,
              validation_data=val_dataset,
              epochs=args['num_epochs'],
              callbacks=callbacks,
              max_queue_size=16,
              initial_epoch=latest_epoch)
    print("export model")
    export.export(model, export_dir, args)
    print("Training Completed!!")
# Example 8 (score: 0)
def train(args):
  """Train a model end to end, with optional Pdart drop-path scheduling and
  Weights & Biases logging, then export the trained model.

  Args:
    args: configuration dict; keys used here include 'dataloader',
      'num_classes', 'server', 'model_name', 'num_epochs', 'wandb_params',
      'framework' and (for Pdart) 'drop_path_prob'.
  """
  print(args)
  global_conf.config_tf2(args)
  checkpoint_dir, log_dir, export_dir = create_env_directories(args, get_experiment_name(args))

  # Build train/val datasets from the same dataloader config, each with its
  # own transformation list and split id.
  train_dataset = dataloader.get_dataset(args['dataloader'], transformation_list=args['dataloader']['train_list'],
                                         num_classes=args["num_classes"], split=args['dataloader']['train_split_id'])
  val_dataset = dataloader.get_dataset(args['dataloader'], transformation_list=args['dataloader']['val_list'],
                                       num_classes=args["num_classes"], split=args['dataloader']['val_split_id'])

  setup_mp(args)
  model, _ = define_model_in_strategy(args, get_model)
  alchemy_api.send_model_info(model, args['server'])
  callbacks = get_callbacks(args, log_dir)

  # Checkpoint callback also yields the epoch to resume from.
  model_checkpoint_cb, latest_epoch = init_custom_checkpoint_callbacks({'model': model}, checkpoint_dir)
  callbacks.append(model_checkpoint_cb)
  # Stream metrics to the Alchemy server only when an experiment id is set.
  if args['server']['id'] != '':
    callbacks.append(alchemy_api.send_metric_callbacks(args['server']))
  # Pdart schedules its drop-path probability at the start of every epoch.
  if args['model_name'] == 'Pdart':
    from src.models.pdart import callback_epoch
    callbacks.append(tf.keras.callbacks.LambdaCallback(on_epoch_begin=lambda epoch, logs: callback_epoch(epoch, args['num_epochs'], args['drop_path_prob'])))
  # Use Weights & Biases only when use_wandb is true and the framework is tensorflow.
  if args['wandb_params']['use_wandb'] and 'tensorflow' in args['framework']:
    from wandb.keras import WandbCallback
    callbacks.append(WandbCallback())
  model.fit(x=train_dataset,
            validation_data=val_dataset,
            epochs=args['num_epochs'],
            callbacks=callbacks,
            max_queue_size=16,
            initial_epoch=latest_epoch
            )
  print("export model")
  export.export(model, export_dir, args)
  print("Training Completed!!")
    def test_compare_without_augmentations_random(self):
        """With no augmentations, our dataloader and the DCN one produce
        (approximately) the same mean on a batch of random images."""
        # Fake dataset written out as TFRecords.
        dataset_dir = create_fake_dataset_and_convert_to_tfrecords(
            image_array=self.batch_random_images, n_images_per_class=10)

        # NOTE: subtracting a per-color-channel mean was also tried and showed
        # differences between the two loaders; for comparison purposes the
        # mean is subtracted following the DCN strategy instead.

        config = {
            'name': 'tfrecords',
            'data_dir': dataset_dir,
            'batch_size': BATCH_SIZE,
            'train_split_id': 'train',
        }

        # Read one batch back from the TFRecords.
        batch, _ = next(iter(get_dataset(config, [], 1, 'train')))

        # Normalize the image similar to DCN strategy.
        scaled = batch / 255.
        x_train_from_our_dataloader = scaled - tf.reduce_mean(scaled, axis=0)

        # Same batch through the DCN dataloader.
        x_train_from_dcn_dataloader = self.get_dcn_dataloader(
            self.batch_random_images)

        self.assertAlmostEqual(np.mean(x_train_from_our_dataloader),
                               np.mean(x_train_from_dcn_dataloader),
                               places=5)

        # clean up
        shutil.rmtree(dataset_dir)
def get_dataset(args):
  """Build and return the dataset described by args['dataloader'],
  forcing the training split id to None first."""
  loader_conf = args['dataloader']
  loader_conf['train_split_id'] = None
  return dataloader.get_dataset(loader_conf,
                                transformation_list=loader_conf['list'],
                                num_classes=args["num_classes"],
                                split=loader_conf['split_id'])
# Example 11 (score: 0)
def train(config):
  """
  Train a model end to end. This function sets up:
    1- Tensorflow (XLA, GPU configuration, mixed precision, execution strategies)
    2- The datasets
    3- The model
    4- The execution environment
    5- The monitoring (Upstride platform and tensorboard)

  Then
    6- starts the training (plain fit or progressive resizing)
    7- exports the model
  """

  # 1
  global_conf.config_tf2(config['config'])
  global_conf.setup_mp(config['config'])
  ds_strategy = global_conf.setup_strategy(config['config']['strategy'])
  if config['model']['channels_first']:  # if True set keras backend to channels_first
    tf.keras.backend.set_image_data_format('channels_first')

  # 2
  train_dataset = dataloader.get_dataset(config['dataloader'], transformation_list=config['dataloader']['train_list'],
                                         num_classes=config['model']["num_classes"], split=config['dataloader']['train_split_id'])
  val_dataset = dataloader.get_dataset(config['dataloader'], transformation_list=config['dataloader']['val_list'],
                                       num_classes=config['model']["num_classes"], split=config['dataloader']['val_split_id'])

  # 3 - model must be created inside the strategy scope to be distributed.
  with ds_strategy.scope():
    model, optimizer = get_compiled_model(config)

  # 4 - persist the resolved configuration alongside the logs.
  checkpoint_dir, log_dir, export_dir = create_env_directories(get_experiment_name(config), config['checkpoint_dir'], config['log_dir'], config['export']['dir'])
  if not os.path.exists(log_dir):
    os.makedirs(log_dir)
  with open(os.path.join(log_dir, "conf.yml"), 'w') as file:
    yaml.dump(config, file)

  # 5
  config['server'] = alchemy_api.start_training(config['server'])
  alchemy_api.send_model_info(model, config['server'])
  callbacks = get_callbacks(config, log_dir)

  with ds_strategy.scope(): # checkpoints needs to be in the same scope.
    model_checkpoint_cb, latest_epoch = init_custom_checkpoint_callbacks({'model': model}, checkpoint_dir, config['max_checkpoints'], config['checkpoint_freq'])

  callbacks.append(model_checkpoint_cb)
  if config['server']['id'] != '':
    callbacks.append(alchemy_api.send_metric_callbacks(config['server']))

  # Pdart schedules its drop-path probability at the start of every epoch.
  if config['model']['name'] == 'Pdart':
    from src.models.pdart import callback_epoch
    callbacks.append(tf.keras.callbacks.LambdaCallback(on_epoch_begin=lambda epoch, logs: callback_epoch(epoch, config['num_epochs'], config['drop_path_prob'])))

  # 6 training
  if config['progressive_resizing']:
    progressive_training(model=model,
                         config=config,
                         train_dataset=train_dataset,
                         val_dataset=val_dataset,
                         callbacks=callbacks,
                         latest_epoch=latest_epoch,
                         max_queue_size=16,
                         optimizer=optimizer)
  else:
    model.fit(x=train_dataset,
              validation_data=val_dataset,
              epochs=config['num_epochs'],
              callbacks=callbacks,
              max_queue_size=16,
              initial_epoch=latest_epoch
              )


  # 7 export
  print("export model")
  export.export(model, export_dir, config)
  print("Training Completed!!")
    def test_data_visualization(self):
        """Run Translate + RandomHorizontalFlip through both pipelines and
        save the augmented images for (optional) visual inspection, asserting
        both outputs differ from the original image."""
        dataset_dir = tempfile.mkdtemp()
        os.makedirs(os.path.join(dataset_dir, 'cat'), exist_ok=True)
        shutil.copyfile(os.path.join(self.image_path),
                        os.path.join(dataset_dir, 'cat', 'cat.jpeg'))

        image = keras_preprocessing.load_img(self.image_path,
                                             grayscale=False,
                                             color_mode='rgb',
                                             target_size=None,
                                             interpolation='nearest')
        image_array = keras_preprocessing.img_to_array(image)
        image_array = image_array[
            np.newaxis, ...]  # adding batch dimension for ImageDataLoader

        args = {
            'name': 'tfrecords',
            'description': 'test',
            'tfrecord_dir_path': dataset_dir,
            'tfrecord_size': 1,
            'preprocessing': 'NO',
            'image_size': DIMENSIONS[1:3],
            "n_tfrecords": 1,
            'data': {
                'images_dir_path': dataset_dir,
                'annotation_file_path': None,
                'delimiter': ',',
                'header_exists': False,
                'split_names': ['train'],
                'split_percentages': [1.0],
            }
        }
        # generate tfrecords
        build_tfrecord_dataset(args)

        config = {
            'name': 'tfrecords',
            'data_dir': dataset_dir,
            'batch_size': 1,
            'train_split_id': 'train',
            'Translate': {
                'width_shift_range': 0.125,
                'height_shift_range': 0.125,
                'padding_strategy': 'reflect'
            }
        }

        # loop through until the RandomHorizontalFlip is applied.
        dataset = get_dataset(config, ['Translate', 'RandomHorizontalFlip'], 1,
                              'train')
        image, _ = next(iter(dataset))  # get the image
        image_i = keras_preprocessing.array_to_img(
            image[0])  # get the image excluding the batch dimension and save
        keras_preprocessing.save_img(
            os.path.join(dataset_dir, 'cat_after_augment_ours.png'),
            image_i)  # save the image
        self.assertFalse(np.allclose(image, image_array),
                         "Augmented Image and original image are the same")

        dcn_augmentations = keras_preprocessing.ImageDataGenerator(
            height_shift_range=config['Translate']['height_shift_range'],
            width_shift_range=config['Translate']['width_shift_range'],
            horizontal_flip=True)

        image_dcn = next(
            iter(
                dcn_augmentations.flow(image_array,
                                       None,
                                       batch_size=1,
                                       shuffle=False)))
        # save the DCN-augmented image excluding batch size
        # (bug fix: previously wrote image[0] — our loader's output — under
        # the DCN filename, so the DCN visualization was never saved)
        keras_preprocessing.save_img(
            os.path.join(dataset_dir, 'cat_after_augment_DCN.png'),
            image_dcn[0])
        self.assertFalse(np.allclose(image_dcn, image_array),
                         "Augmented Image and original image are the same")

        # remove the below line in order to perform the visual inspection
        shutil.rmtree(dataset_dir)
    def test_compare_with_augmentation_randomflip(self):
        """RandomHorizontalFlip on a black/white image must change the image
        in both pipelines while keeping shape and mean in agreement."""
        # create fake dataset
        dataset_dir = tempfile.mkdtemp()
        os.makedirs(os.path.join(dataset_dir, 'black_white'), exist_ok=True)
        shutil.copyfile(
            os.path.join(self.black_white_image),
            os.path.join(dataset_dir, 'black_white', 'black_and_white.jpeg'))
        # Ensure cleanup runs even when an assertion fails.
        self.addCleanup(shutil.rmtree, dataset_dir)

        image = keras_preprocessing.load_img(self.black_white_image,
                                             grayscale=False,
                                             color_mode='rgb',
                                             target_size=None,
                                             interpolation='nearest')
        image_array = keras_preprocessing.img_to_array(image)
        image_array = image_array[
            np.newaxis, ...]  # adding batch dimension for ImageDataLoader

        args = {
            'name': 'tfrecords',
            'description': 'test',
            'tfrecord_dir_path': dataset_dir,
            'tfrecord_size': 1,
            'preprocessing': 'NO',
            'image_size': DIMENSIONS[1:3],
            "n_tfrecords": 1,
            'data': {
                'images_dir_path': dataset_dir,
                'annotation_file_path': None,
                'delimiter': ',',
                'header_exists': False,
                'split_names': ['train'],
                'split_percentages': [1.0],
            }
        }
        # generate tfrecords
        build_tfrecord_dataset(args)

        config = {
            'name': 'tfrecords',
            'data_dir': dataset_dir,
            'batch_size': 1,
            'train_split_id': 'train',
        }

        # loop through until the RandomHorizontalFlip is applied
        dataset = get_dataset(config, ['RandomHorizontalFlip'], 1, 'train')
        x_train_from_our_dataloader, _ = next(iter(dataset))
        self.assertFalse(np.allclose(x_train_from_our_dataloader, image_array),
                         "Augmented Image and original image are the same")

        # loop through until the RandomHorizontalFlip is applied
        dcn_augmentations = keras_preprocessing.ImageDataGenerator(
            horizontal_flip=True)
        x_train_from_dcn_dataloader = next(
            iter(
                dcn_augmentations.flow(image_array,
                                       None,
                                       batch_size=1,
                                       shuffle=False)))
        self.assertFalse(np.allclose(x_train_from_dcn_dataloader, image_array),
                         "Augmented Image and original image are the same")

        # Test the tensor shape remains the same.
        # (Was assertTrue(a.shape, b.shape): the second argument is only the
        # failure message, so nothing was compared — now a real equality.)
        self.assertEqual(tuple(x_train_from_our_dataloader.shape),
                         tuple(x_train_from_dcn_dataloader.shape))

        # Note: The example used here is to test if the augmentation works using a black and white image.
        # This test would fail if an image with wide range of pixel values across the color channels are used.
        # This is a limitation due to the loss of data (encode to bytes and decode back to image) for JPEG format.
        # In the TFrecord writer we use opencv and decode using tensorflow library. The encoded bytes using openCV and tensorflow are not identical.
        # There is a slight pixel difference which is not noticible to the human eyes.
        self.assertAlmostEqual(np.mean(x_train_from_our_dataloader),
                               np.mean(x_train_from_dcn_dataloader),
                               places=7)
# Example 14 (score: 0)
def train(args):
    """FBNetV2-style differentiable architecture search.

    Alternates two optimization phases per epoch: (1) update the model
    *weights* on the 'train_weights' split, then — after a warm-up period —
    (2) update the *architecture parameters* on the 'train_arch' split.
    Validation runs after each phase; metrics go to three summary writers
    (train / val / arch). Finally the searched architecture is exported.
    """
    # config_tf2(args['configuration']['xla'])
    # Create log, checkpoint and export directories
    checkpoint_dir, log_dir, export_dir = create_env_directories(
        args, get_experiment_name(args))
    train_log_dir = os.path.join(log_dir, 'train')
    val_log_dir = os.path.join(log_dir, 'validation')
    arch_log_dir = os.path.join(log_dir, 'arch')
    summary_writers = {
        'train': tf.summary.create_file_writer(train_log_dir),
        'val': tf.summary.create_file_writer(val_log_dir),
        'arch': tf.summary.create_file_writer(arch_log_dir)
    }

    # Prepare the 3 datasets: one per optimization phase plus validation.
    train_weight_dataset = dataloader.get_dataset(
        args['dataloader'],
        transformation_list=args['dataloader']['train_list'],
        num_classes=args["num_classes"],
        split='train_weights')
    train_arch_dataset = dataloader.get_dataset(
        args['dataloader'],
        transformation_list=args['dataloader']['train_list'],
        num_classes=args["num_classes"],
        split='train_arch')
    val_dataset = dataloader.get_dataset(
        args['dataloader'],
        transformation_list=args['dataloader']['val_list'],
        num_classes=args["num_classes"],
        split='test')

    # define model, optimizer and checkpoint callback
    setup_mp(args)
    model = model_name_to_class[args['model_name']](
        args['framework'],
        input_shape=args['input_size'],
        label_dim=args['num_classes']).model
    model.summary()

    alchemy_api.send_model_info(model, args['server'])
    # Separate trainable variables: network weights vs architecture params,
    # each with its own optimizer.
    weights, arch_params = fbnetv2.split_trainable_weights(model)
    weight_opt = get_optimizer(args['optimizer'])
    arch_opt = get_optimizer(args['arch_search']['optimizer'])
    model_checkpoint_cb, latest_epoch = init_custom_checkpoint_callbacks(
        {'model': model}, checkpoint_dir, args['max_checkpoints'],
        args['checkpoint_freq'])
    callbacks = [model_checkpoint_cb]

    # Gumbel-softmax temperature annealing schedule for the arch sampling.
    temperature_decay_fn = fbnetv2.exponential_decay(
        args['arch_search']['temperature']['init_value'],
        args['arch_search']['temperature']['decay_steps'],
        args['arch_search']['temperature']['decay_rate'])

    lr_decay_fn = CosineDecay(
        args['optimizer']['lr'],
        alpha=args["optimizer"]["lr_decay_strategy"]["lr_params"]["alpha"],
        total_epochs=args['num_epochs'])

    lr_decay_fn_arch = CosineDecay(args['arch_search']['optimizer']['lr'],
                                   alpha=0.000001,
                                   total_epochs=args['num_epochs'])

    # Metric objects are reused across epochs; metrics_processing is expected
    # to read and reset them.
    metrics = {
        'arch': {
            'latency_reg_loss': tf.keras.metrics.Mean()
        },
        'train': {
            'total_loss': tf.keras.metrics.Mean(),
            'accuracy': tf.keras.metrics.CategoricalAccuracy(),
            'cross_entropy_loss': tf.keras.metrics.Mean(),
        },
        'val': {
            'accuracy': tf.keras.metrics.CategoricalAccuracy(),
            'cross_entropy_loss': tf.keras.metrics.Mean(),
        }
    }

    train_step = get_train_step_function(model, weights, weight_opt,
                                         metrics['train'])
    train_step_arch = get_train_step_arch_function(model, arch_params,
                                                   arch_opt, metrics['train'],
                                                   metrics['arch'])
    evaluation_step = get_eval_step_function(model, metrics['val'])

    for epoch in range(latest_epoch, args['num_epochs']):
        print(f'Epoch: {epoch}/{args["num_epochs"]}')
        # Update both LR
        weight_opt.learning_rate = lr_decay_fn(epoch)
        arch_opt.learning_rate = lr_decay_fn_arch(epoch)
        # Updating the weight parameters using a subset of the training data
        for step, (x_batch, y_batch) in tqdm.tqdm(
                enumerate(train_weight_dataset, start=1)):
            train_step(x_batch, y_batch)
        # Evaluate the model on validation subset
        for x_batch, y_batch in val_dataset:
            evaluation_step(x_batch, y_batch)
        # Handle metrics
        template = f"Weights updated, Epoch {epoch}"
        template = metrics_processing(metrics, summary_writers,
                                      ['train', 'val'], template, epoch)
        template += f", lr: {float(weight_opt.learning_rate)}"
        print(template)

        # Anneal the sampling temperature once per epoch.
        new_temperature = temperature_decay_fn(epoch)
        with summary_writers['train'].as_default():
            tf.summary.scalar('temperature', new_temperature, step=epoch)
        define_temperature(new_temperature)

        if epoch >= args['arch_search']['num_warmup']:
            # Updating the architectural parameters on another subset
            for step, (x_batch, y_batch) in tqdm.tqdm(
                    enumerate(train_arch_dataset, start=1)):
                train_step_arch(x_batch, y_batch)
            # Evaluate the model on validation subset
            for x_batch, y_batch in val_dataset:
                evaluation_step(x_batch, y_batch)
            # Handle metrics
            template = f'Architecture updated, Epoch {epoch}'
            template = metrics_processing(metrics,
                                          summary_writers,
                                          ['train', 'val', 'arch'],
                                          template,
                                          epoch,
                                          postfix='_arch')
            template += f", lr: {float(arch_opt.learning_rate)}"
            print(template)
        # move saved outside of condition so we save starting from the begining
        fbnetv2.save_arch_params(model, epoch, log_dir)

        # manually call the callbacks (no model.fit loop here to drive them)
        for callback in callbacks:
            callback.on_epoch_end(epoch, logs=None)

    print("Training Completed!!")

    print("Architecture params: ")
    print(arch_params)
    fbnetv2.post_training_analysis(
        model, args['arch_search']['exported_architecture'])
# Example 15 (score: 0)
def train(args):
    """Differentiable architecture search with an explicit training loop.

    Alternates per epoch between (1) updating the model *weights* on the
    'train_weights' split and — from epoch 10 onward — (2) updating the
    *architecture parameters* on the 'train_arch' split, evaluating on the
    validation split after each phase and logging to TensorBoard.
    """
    # config_tf2(args['configuration']['xla'])
    # Create log, checkpoint and export directories
    checkpoint_dir, log_dir, export_dir = create_env_directories(
        args, get_experiment_name(args))

    # One dataset per optimization phase plus validation.
    train_weight_dataset = dataloader.get_dataset(
        args['dataloader'],
        transformation_list=args['dataloader']['train_list'],
        num_classes=args["num_classes"],
        split='train_weights')
    train_arch_dataset = dataloader.get_dataset(
        args['dataloader'],
        transformation_list=args['dataloader']['train_list'],
        num_classes=args["num_classes"],
        split='train_arch')
    val_dataset = dataloader.get_dataset(
        args['dataloader'],
        transformation_list=args['dataloader']['val_list'],
        num_classes=args["num_classes"],
        split='validation')

    setup_mp(args)

    # define model, optimizer and checkpoint callback
    model = model_name_to_class[args['model_name']](
        args['framework'],
        input_shape=args['input_size'],
        label_dim=args['num_classes']).model
    model.summary()
    alchemy_api.send_model_info(model, args['server'])
    weight_opt = get_optimizer(args['optimizer'])
    arch_opt = get_optimizer(args['arch_optimizer_param'])
    model_checkpoint_cb, latest_epoch = init_custom_checkpoint_callbacks(
        {'model': model}, checkpoint_dir)

    # Split trainable variables: network weights vs architecture parameters.
    weights, arch_params = split_trainable_weights(model)
    # Gumbel-softmax temperature annealing schedule.
    temperature_decay_fn = exponential_decay(
        args['temperature']['init_value'], args['temperature']['decay_steps'],
        args['temperature']['decay_rate'])

    lr_decay_fn = CosineDecay(
        args['optimizer']['lr'],
        alpha=args["optimizer"]["lr_decay_strategy"]["lr_params"]["alpha"],
        total_epochs=args['num_epochs'])

    loss_fn = CategoricalCrossentropy()
    accuracy_metric = CategoricalAccuracy()
    loss_metric = Mean()
    val_accuracy_metric = CategoricalAccuracy()
    val_loss_metric = Mean()

    train_log_dir = os.path.join(args['log_dir'], 'train')
    val_log_dir = os.path.join(args['log_dir'], 'validation')
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)
    val_summary_writer = tf.summary.create_file_writer(val_log_dir)

    @tf.function
    def train_step(x_batch, y_batch):
        # Gradient step on the network weights only (training=True).
        with tf.GradientTape() as tape:
            y_hat = model(x_batch, training=True)
            loss = loss_fn(y_batch, y_hat)

        accuracy_metric.update_state(y_batch, y_hat)
        loss_metric.update_state(loss)
        grads = tape.gradient(loss, weights)
        weight_opt.apply_gradients(zip(grads, weights))

    @tf.function
    def train_step_arch(x_batch, y_batch):
        # Gradient step on the architecture parameters only; training=False
        # so weight-specific layers (e.g. dropout/BN updates) stay inactive.
        with tf.GradientTape() as tape:
            y_hat = model(x_batch, training=False)
            loss = loss_fn(y_batch, y_hat)

        accuracy_metric.update_state(y_batch, y_hat)
        loss_metric.update_state(loss)
        grads = tape.gradient(loss, arch_params)
        arch_opt.apply_gradients(zip(grads, arch_params))

    @tf.function
    def evaluation_step(x_batch, y_batch):
        # Forward pass only; accumulates validation metrics.
        y_hat = model(x_batch, training=False)
        loss = loss_fn(y_batch, y_hat)

        val_accuracy_metric.update_state(y_batch, y_hat)
        val_loss_metric.update_state(loss)

    for epoch in range(latest_epoch, args['num_epochs']):
        print(f'Epoch: {epoch}/{args["num_epochs"]}')

        weight_opt.learning_rate = lr_decay_fn(epoch)

        # Updating the weight parameters using a subset of the training data
        for step, (x_batch, y_batch) in tqdm.tqdm(
                enumerate(train_weight_dataset, start=1)):
            train_step(x_batch, y_batch)

        # Evaluate the model on validation subset
        for x_batch, y_batch in val_dataset:
            evaluation_step(x_batch, y_batch)

        train_accuracy = accuracy_metric.result()
        train_loss = loss_metric.result()
        val_accuracy = val_accuracy_metric.result()
        val_loss = val_loss_metric.result()

        template = f'Weights updated, Epoch {epoch}, Train Loss: {float(train_loss)}, Train Accuracy: ' \
            f'{float(train_accuracy)}, Val Loss: {float(val_loss)}, Val Accuracy: {float(val_accuracy)}, ' \
            f'lr: {float(weight_opt.learning_rate)}'
        print(template)

        new_temperature = temperature_decay_fn(epoch)

        with train_summary_writer.as_default():
            tf.summary.scalar('loss', train_loss, step=epoch)
            tf.summary.scalar('accuracy', train_accuracy, step=epoch)
            tf.summary.scalar('temperature', new_temperature, step=epoch)

        with val_summary_writer.as_default():
            tf.summary.scalar('loss', val_loss, step=epoch)
            tf.summary.scalar('accuracy', val_accuracy, step=epoch)

        # Resetting metrics for reuse in the architecture phase below.
        accuracy_metric.reset_states()
        loss_metric.reset_states()
        val_accuracy_metric.reset_states()
        val_loss_metric.reset_states()

        # Warm-up: architecture parameters are frozen for the first 10 epochs.
        if epoch >= 10:
            # Updating the architectural parameters on another subset
            for step, (x_batch, y_batch) in tqdm.tqdm(
                    enumerate(train_arch_dataset, start=1)):
                train_step_arch(x_batch, y_batch)

            # Evaluate the model on validation subset
            for x_batch, y_batch in val_dataset:
                evaluation_step(x_batch, y_batch)

            train_accuracy = accuracy_metric.result()
            train_loss = loss_metric.result()
            val_accuracy = val_accuracy_metric.result()
            val_loss = val_loss_metric.result()

            template = f'Arch params updated, Epoch {epoch}, Train Loss: {float(train_loss)}, Train Accuracy: ' \
                f'{float(train_accuracy)}, Val Loss: {float(val_loss)}, Val Accuracy: {float(val_accuracy)}'
            print(template)
            with train_summary_writer.as_default():
                tf.summary.scalar('loss_after_arch_params_update',
                                  train_loss,
                                  step=epoch)
                tf.summary.scalar('accuracy_after_arch_params_update',
                                  train_accuracy,
                                  step=epoch)

            with val_summary_writer.as_default():
                tf.summary.scalar('loss_after_arch_params_update',
                                  val_loss,
                                  step=epoch)
                tf.summary.scalar('accuracy_after_arch_params_update',
                                  val_accuracy,
                                  step=epoch)

            # Resetting metrics for reuse in the next epoch.
            accuracy_metric.reset_states()
            loss_metric.reset_states()
            val_accuracy_metric.reset_states()
            val_loss_metric.reset_states()

        # Apply the annealed temperature for the next epoch's sampling.
        define_temperature(new_temperature)

    print("Training Completed!!")

    print("Architecture params: ")
    print(arch_params)
    post_training_analysis(model, args['exported_architecture'])