def train(args):
    print(args)
    global_conf.config_tf2(args)
    checkpoint_dir, log_dir, export_dir = create_env_directories(
        args, get_experiment_name(args))

    train_dataset = dataloader.get_dataset(
        args['dataloader'],
        transformation_list=args['dataloader']['train_list'],
        num_classes=args["num_classes"],
        split=args['dataloader']['train_split_id'])
    val_dataset = dataloader.get_dataset(
        args['dataloader'],
        transformation_list=args['dataloader']['val_list'],
        num_classes=args["num_classes"],
        split=args['dataloader']['val_split_id'])

    setup_mp(args)
    build_model_fn = get_model(args)
    callbacks = get_callbacks(args, log_dir)

    # tuner = Hyperband(build_model_fn,
    #                   objective='val_accuracy',
    #                   max_epochs=args['num_epochs'],
    #                   hyperband_iterations=10e100,
    #                   directory=checkpoint_dir)

    tuner = BayesianOptimization(build_model_fn,
                                 objective='val_accuracy',
                                 max_trials=100000,
                                 num_initial_points=10,
                                 directory=checkpoint_dir)

    tuner.search_space_summary()
    tuner.search(x=train_dataset,
                 validation_data=val_dataset,
                 callbacks=callbacks,
                 epochs=args['num_epochs'])
    tuner.results_summary()
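
The tuner above expects build_model_fn to be a Keras Tuner model-building function: it receives a HyperParameters object and returns a compiled model. get_model is not part of this excerpt, so the sketch below is a hypothetical stand-in that only illustrates the expected contract (layer sizes and learning-rate choices are illustrative, not taken from this project).

import tensorflow as tf

def build_model_fn(hp):
    # Hypothetical Keras Tuner build function: sample a few hyperparameters
    # from `hp` and return a compiled model for the tuner to train.
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(
            hp.Int('units', min_value=64, max_value=256, step=64),
            activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
        loss='categorical_crossentropy',
        metrics=['accuracy'])
    return model
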
Example #2
def train(args):
  print(args)
  global_conf.config_tf2(args)
  checkpoint_dir, log_dir, export_dir = create_env_directories(args, get_experiment_name(args))

  train_dataset = dataloader.get_dataset(args['dataloader'], transformation_list=args['dataloader']['train_list'],
                                         num_classes=args["num_classes"], split=args['dataloader']['train_split_id'])
  val_dataset = dataloader.get_dataset(args['dataloader'], transformation_list=args['dataloader']['val_list'],
                                       num_classes=args["num_classes"], split=args['dataloader']['val_split_id'])

  setup_mp(args)
  model, _ = define_model_in_strategy(args, get_model)
  alchemy_api.send_model_info(model, args['server'])
  callbacks = get_callbacks(args, log_dir)

  model_checkpoint_cb, latest_epoch = init_custom_checkpoint_callbacks({'model': model}, checkpoint_dir)
  callbacks.append(model_checkpoint_cb)
  if args['server']['id'] != '':
    callbacks.append(alchemy_api.send_metric_callbacks(args['server']))
  if args['model_name'] == 'Pdart':
    from src.models.pdart import callback_epoch
    callbacks.append(tf.keras.callbacks.LambdaCallback(on_epoch_begin=lambda epoch, logs: callback_epoch(epoch, args['num_epochs'], args['drop_path_prob'])))
  # Use Weights & Biases only if use_wandb is true and the framework is TensorFlow
  if args['wandb_params']['use_wandb'] and 'tensorflow' in args['framework']:
    from wandb.keras import WandbCallback
    callbacks.append(WandbCallback())
  model.fit(x=train_dataset,
            validation_data=val_dataset,
            epochs=args['num_epochs'],
            callbacks=callbacks,
            max_queue_size=16,
            initial_epoch=latest_epoch
            )
  print("export model")
  export.export(model, export_dir, args)
  print("Training Completed!!")
Example #3
def train(config):
  """
  This function setup:
    1- Tensorflow (XLA, GPU configuration, mixed precision, execution strategies)
    2- The datasets
    3- The model
    4- The execution environment
    5- The monitoring (Upstride plateform and tensorboard)

  Then 
    6- start the training
    7- Export the model
  """

  # 1
  global_conf.config_tf2(config['config'])
  global_conf.setup_mp(config['config'])
  ds_strategy = global_conf.setup_strategy(config['config']['strategy'])
  if config['model']['channels_first']:  # if True set keras backend to channels_first
    tf.keras.backend.set_image_data_format('channels_first')

  # 2
  train_dataset = dataloader.get_dataset(config['dataloader'], transformation_list=config['dataloader']['train_list'],
                                         num_classes=config['model']["num_classes"], split=config['dataloader']['train_split_id'])
  val_dataset = dataloader.get_dataset(config['dataloader'], transformation_list=config['dataloader']['val_list'],
                                       num_classes=config['model']["num_classes"], split=config['dataloader']['val_split_id'])

  # 3
  with ds_strategy.scope():
    model, optimizer = get_compiled_model(config)

  # 4
  checkpoint_dir, log_dir, export_dir = create_env_directories(get_experiment_name(config), config['checkpoint_dir'], config['log_dir'], config['export']['dir'])
  if not os.path.exists(log_dir):
    os.makedirs(log_dir)
  with open(os.path.join(log_dir, "conf.yml"), 'w') as file:
    yaml.dump(config, file)

  # 5
  config['server'] = alchemy_api.start_training(config['server'])
  alchemy_api.send_model_info(model, config['server'])
  callbacks = get_callbacks(config, log_dir)

  with ds_strategy.scope():  # checkpoints need to be created in the same scope as the model
    model_checkpoint_cb, latest_epoch = init_custom_checkpoint_callbacks({'model': model}, checkpoint_dir, config['max_checkpoints'], config['checkpoint_freq'])

  callbacks.append(model_checkpoint_cb)
  if config['server']['id'] != '':
    callbacks.append(alchemy_api.send_metric_callbacks(config['server']))

  if config['model']['name'] == 'Pdart':
    from src.models.pdart import callback_epoch
    callbacks.append(tf.keras.callbacks.LambdaCallback(on_epoch_begin=lambda epoch, logs: callback_epoch(epoch, config['num_epochs'], config['drop_path_prob'])))

  # 6 training
  if config['progressive_resizing']:
    progressive_training(model=model,
                         config=config,
                         train_dataset=train_dataset,
                         val_dataset=val_dataset,
                         callbacks=callbacks,
                         latest_epoch=latest_epoch,
                         max_queue_size=16,
                         optimizer=optimizer)
  else:
    model.fit(x=train_dataset,
              validation_data=val_dataset,
              epochs=config['num_epochs'],
              callbacks=callbacks,
              max_queue_size=16,
              initial_epoch=latest_epoch
              )


  # 7 export
  print("export model")
  export.export(model, export_dir, config)
  print("Training Completed!!")
Example #4
def train(args):
    # config_tf2(args['configuration']['xla'])
    # Create log, checkpoint and export directories
    checkpoint_dir, log_dir, export_dir = create_env_directories(
        args, get_experiment_name(args))
    train_log_dir = os.path.join(log_dir, 'train')
    val_log_dir = os.path.join(log_dir, 'validation')
    arch_log_dir = os.path.join(log_dir, 'arch')
    summary_writers = {
        'train': tf.summary.create_file_writer(train_log_dir),
        'val': tf.summary.create_file_writer(val_log_dir),
        'arch': tf.summary.create_file_writer(arch_log_dir)
    }

    # Prepare the 3 datasets
    train_weight_dataset = dataloader.get_dataset(
        args['dataloader'],
        transformation_list=args['dataloader']['train_list'],
        num_classes=args["num_classes"],
        split='train_weights')
    train_arch_dataset = dataloader.get_dataset(
        args['dataloader'],
        transformation_list=args['dataloader']['train_list'],
        num_classes=args["num_classes"],
        split='train_arch')
    val_dataset = dataloader.get_dataset(
        args['dataloader'],
        transformation_list=args['dataloader']['val_list'],
        num_classes=args["num_classes"],
        split='test')

    # define model, optimizer and checkpoint callback
    setup_mp(args)
    model = model_name_to_class[args['model_name']](
        args['framework'],
        input_shape=args['input_size'],
        label_dim=args['num_classes']).model
    model.summary()

    alchemy_api.send_model_info(model, args['server'])
    weights, arch_params = fbnetv2.split_trainable_weights(model)
    weight_opt = get_optimizer(args['optimizer'])
    arch_opt = get_optimizer(args['arch_search']['optimizer'])
    model_checkpoint_cb, latest_epoch = init_custom_checkpoint_callbacks(
        {'model': model}, checkpoint_dir, args['max_checkpoints'],
        args['checkpoint_freq'])
    callbacks = [model_checkpoint_cb]

    temperature_decay_fn = fbnetv2.exponential_decay(
        args['arch_search']['temperature']['init_value'],
        args['arch_search']['temperature']['decay_steps'],
        args['arch_search']['temperature']['decay_rate'])

    lr_decay_fn = CosineDecay(
        args['optimizer']['lr'],
        alpha=args["optimizer"]["lr_decay_strategy"]["lr_params"]["alpha"],
        total_epochs=args['num_epochs'])

    lr_decay_fn_arch = CosineDecay(args['arch_search']['optimizer']['lr'],
                                   alpha=0.000001,
                                   total_epochs=args['num_epochs'])

    metrics = {
        'arch': {
            'latency_reg_loss': tf.keras.metrics.Mean()
        },
        'train': {
            'total_loss': tf.keras.metrics.Mean(),
            'accuracy': tf.keras.metrics.CategoricalAccuracy(),
            'cross_entropy_loss': tf.keras.metrics.Mean(),
        },
        'val': {
            'accuracy': tf.keras.metrics.CategoricalAccuracy(),
            'cross_entropy_loss': tf.keras.metrics.Mean(),
        }
    }

    train_step = get_train_step_function(model, weights, weight_opt,
                                         metrics['train'])
    train_step_arch = get_train_step_arch_function(model, arch_params,
                                                   arch_opt, metrics['train'],
                                                   metrics['arch'])
    evaluation_step = get_eval_step_function(model, metrics['val'])

    for epoch in range(latest_epoch, args['num_epochs']):
        print(f'Epoch: {epoch}/{args["num_epochs"]}')
        # Update both LR
        weight_opt.learning_rate = lr_decay_fn(epoch)
        arch_opt.learning_rate = lr_decay_fn_arch(epoch)
        # Updating the weight parameters using a subset of the training data
        for step, (x_batch, y_batch) in tqdm.tqdm(
                enumerate(train_weight_dataset, start=1)):
            train_step(x_batch, y_batch)
        # Evaluate the model on validation subset
        for x_batch, y_batch in val_dataset:
            evaluation_step(x_batch, y_batch)
        # Handle metrics
        template = f"Weights updated, Epoch {epoch}"
        template = metrics_processing(metrics, summary_writers,
                                      ['train', 'val'], template, epoch)
        template += f", lr: {float(weight_opt.learning_rate)}"
        print(template)

        new_temperature = temperature_decay_fn(epoch)
        with summary_writers['train'].as_default():
            tf.summary.scalar('temperature', new_temperature, step=epoch)
        define_temperature(new_temperature)

        if epoch >= args['arch_search']['num_warmup']:
            # Updating the architectural parameters on another subset
            for step, (x_batch, y_batch) in tqdm.tqdm(
                    enumerate(train_arch_dataset, start=1)):
                train_step_arch(x_batch, y_batch)
            # Evaluate the model on validation subset
            for x_batch, y_batch in val_dataset:
                evaluation_step(x_batch, y_batch)
            # Handle metrics
            template = f'Architecture updated, Epoch {epoch}'
            template = metrics_processing(metrics,
                                          summary_writers,
                                          ['train', 'val', 'arch'],
                                          template,
                                          epoch,
                                          postfix='_arch')
            template += f", lr: {float(arch_opt.learning_rate)}"
            print(template)
        # the save is kept outside of the condition so we save starting from the beginning
        fbnetv2.save_arch_params(model, epoch, log_dir)

        # manually call the callbacks
        for callback in callbacks:
            callback.on_epoch_end(epoch, logs=None)

    print("Training Completed!!")

    print("Architecture params: ")
    print(arch_params)
    fbnetv2.post_training_analysis(
        model, args['arch_search']['exported_architecture'])
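
Examples #4 and #5 rely on two per-epoch schedules: CosineDecay(lr, alpha=..., total_epochs=...) for the learning rate and exponential_decay(init_value, decay_steps, decay_rate) for the Gumbel-Softmax temperature. Neither helper is shown, and the CosineDecay signature used here differs from tf.keras.optimizers.schedules.CosineDecay (which counts steps, not epochs). The sketch below reproduces the standard formulas under that per-epoch assumption.

import math

class CosineDecay:
    """Hypothetical per-epoch cosine schedule: decays from initial_lr to alpha * initial_lr."""

    def __init__(self, initial_lr, alpha, total_epochs):
        self.initial_lr = initial_lr
        self.alpha = alpha
        self.total_epochs = total_epochs

    def __call__(self, epoch):
        # Standard cosine annealing, clamped at total_epochs.
        progress = min(epoch, self.total_epochs) / self.total_epochs
        cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
        return self.initial_lr * ((1.0 - self.alpha) * cosine + self.alpha)

def exponential_decay(init_value, decay_steps, decay_rate):
    # Hypothetical temperature schedule: multiply by decay_rate every decay_steps epochs.
    return lambda epoch: init_value * (decay_rate ** (epoch / decay_steps))
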
Example #5
def train(args):
    # config_tf2(args['configuration']['xla'])
    # Create log, checkpoint and export directories
    checkpoint_dir, log_dir, export_dir = create_env_directories(
        args, get_experiment_name(args))

    train_weight_dataset = dataloader.get_dataset(
        args['dataloader'],
        transformation_list=args['dataloader']['train_list'],
        num_classes=args["num_classes"],
        split='train_weights')
    train_arch_dataset = dataloader.get_dataset(
        args['dataloader'],
        transformation_list=args['dataloader']['train_list'],
        num_classes=args["num_classes"],
        split='train_arch')
    val_dataset = dataloader.get_dataset(
        args['dataloader'],
        transformation_list=args['dataloader']['val_list'],
        num_classes=args["num_classes"],
        split='validation')

    setup_mp(args)

    # define model, optimizer and checkpoint callback
    model = model_name_to_class[args['model_name']](
        args['framework'],
        input_shape=args['input_size'],
        label_dim=args['num_classes']).model
    model.summary()
    alchemy_api.send_model_info(model, args['server'])
    weight_opt = get_optimizer(args['optimizer'])
    arch_opt = get_optimizer(args['arch_optimizer_param'])
    model_checkpoint_cb, latest_epoch = init_custom_checkpoint_callbacks(
        {'model': model}, checkpoint_dir)

    weights, arch_params = split_trainable_weights(model)
    temperature_decay_fn = exponential_decay(
        args['temperature']['init_value'], args['temperature']['decay_steps'],
        args['temperature']['decay_rate'])

    lr_decay_fn = CosineDecay(
        args['optimizer']['lr'],
        alpha=args["optimizer"]["lr_decay_strategy"]["lr_params"]["alpha"],
        total_epochs=args['num_epochs'])

    loss_fn = CategoricalCrossentropy()
    accuracy_metric = CategoricalAccuracy()
    loss_metric = Mean()
    val_accuracy_metric = CategoricalAccuracy()
    val_loss_metric = Mean()

    train_log_dir = os.path.join(args['log_dir'], 'train')
    val_log_dir = os.path.join(args['log_dir'], 'validation')
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)
    val_summary_writer = tf.summary.create_file_writer(val_log_dir)

    @tf.function
    def train_step(x_batch, y_batch):
        with tf.GradientTape() as tape:
            y_hat = model(x_batch, training=True)
            loss = loss_fn(y_batch, y_hat)

        accuracy_metric.update_state(y_batch, y_hat)
        loss_metric.update_state(loss)
        grads = tape.gradient(loss, weights)
        weight_opt.apply_gradients(zip(grads, weights))

    @tf.function
    def train_step_arch(x_batch, y_batch):
        with tf.GradientTape() as tape:
            y_hat = model(x_batch, training=False)
            loss = loss_fn(y_batch, y_hat)

        accuracy_metric.update_state(y_batch, y_hat)
        loss_metric.update_state(loss)
        grads = tape.gradient(loss, arch_params)
        arch_opt.apply_gradients(zip(grads, arch_params))

    @tf.function
    def evaluation_step(x_batch, y_batch):
        y_hat = model(x_batch, training=False)
        loss = loss_fn(y_batch, y_hat)

        val_accuracy_metric.update_state(y_batch, y_hat)
        val_loss_metric.update_state(loss)

    for epoch in range(latest_epoch, args['num_epochs']):
        print(f'Epoch: {epoch}/{args["num_epochs"]}')

        weight_opt.learning_rate = lr_decay_fn(epoch)

        # Updating the weight parameters using a subset of the training data
        for step, (x_batch, y_batch) in tqdm.tqdm(
                enumerate(train_weight_dataset, start=1)):
            train_step(x_batch, y_batch)

        # Evaluate the model on validation subset
        for x_batch, y_batch in val_dataset:
            evaluation_step(x_batch, y_batch)

        train_accuracy = accuracy_metric.result()
        train_loss = loss_metric.result()
        val_accuracy = val_accuracy_metric.result()
        val_loss = val_loss_metric.result()

        template = f'Weights updated, Epoch {epoch}, Train Loss: {float(train_loss)}, Train Accuracy: ' \
            f'{float(train_accuracy)}, Val Loss: {float(val_loss)}, Val Accuracy: {float(val_accuracy)}, ' \
            f'lr: {float(weight_opt.learning_rate)}'
        print(template)

        new_temperature = temperature_decay_fn(epoch)

        with train_summary_writer.as_default():
            tf.summary.scalar('loss', train_loss, step=epoch)
            tf.summary.scalar('accuracy', train_accuracy, step=epoch)
            tf.summary.scalar('temperature', new_temperature, step=epoch)

        with val_summary_writer.as_default():
            tf.summary.scalar('loss', val_loss, step=epoch)
            tf.summary.scalar('accuracy', val_accuracy, step=epoch)

        # Resetting metrics for reuse
        accuracy_metric.reset_states()
        loss_metric.reset_states()
        val_accuracy_metric.reset_states()
        val_loss_metric.reset_states()

        if epoch >= 10:
            # Updating the architectural parameters on another subset
            for step, (x_batch, y_batch) in tqdm.tqdm(
                    enumerate(train_arch_dataset, start=1)):
                train_step_arch(x_batch, y_batch)

            # Evaluate the model on validation subset
            for x_batch, y_batch in val_dataset:
                evaluation_step(x_batch, y_batch)

            train_accuracy = accuracy_metric.result()
            train_loss = loss_metric.result()
            val_accuracy = val_accuracy_metric.result()
            val_loss = val_loss_metric.result()

            template = f'Arch params updated, Epoch {epoch}, Train Loss: {float(train_loss)}, Train Accuracy: ' \
                f'{float(train_accuracy)}, Val Loss: {float(val_loss)}, Val Accuracy: {float(val_accuracy)}'
            print(template)
            with train_summary_writer.as_default():
                tf.summary.scalar('loss_after_arch_params_update',
                                  train_loss,
                                  step=epoch)
                tf.summary.scalar('accuracy_after_arch_params_update',
                                  train_accuracy,
                                  step=epoch)

            with val_summary_writer.as_default():
                tf.summary.scalar('loss_after_arch_params_update',
                                  val_loss,
                                  step=epoch)
                tf.summary.scalar('accuracy_after_arch_params_update',
                                  val_accuracy,
                                  step=epoch)

            # Resetting metrics for reuse
            accuracy_metric.reset_states()
            loss_metric.reset_states()
            val_accuracy_metric.reset_states()
            val_loss_metric.reset_states()

        define_temperature(new_temperature)

    print("Training Completed!!")

    print("Architecture params: ")
    print(arch_params)
    post_training_analysis(model, args['exported_architecture'])
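
Both FBNetV2 examples call split_trainable_weights(model) to separate the network weights updated by the main optimizer from the architecture parameters updated by the search optimizer. The project's criterion for telling the two apart is not visible in this excerpt; the sketch below assumes architecture parameters carry an identifying substring (here, hypothetically, 'arch_param') in their variable names.

def split_trainable_weights(model, arch_tag='arch_param'):
    # Hypothetical split: variables whose names contain `arch_tag` are treated as
    # architecture parameters, everything else as regular network weights.
    weights, arch_params = [], []
    for variable in model.trainable_weights:
        if arch_tag in variable.name:
            arch_params.append(variable)
        else:
            weights.append(variable)
    return weights, arch_params
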