Example #1
def main():
  """ Function called when starting the code via the command line.
  """
  args = argparse.parse_cmd(arguments)  # project-specific parser, not the stdlib argparse module
  args['server'] = alchemy_api.start_training(args['server'])
  # Use Weights & Biases only if use_wandb is true and the framework is TensorFlow
  if args['wandb_params']['use_wandb'] and "tensorflow" in args['framework']:
    import wandb
    wandb.init(name=args['wandb_params']['run_name'], project=args['wandb_params']['project'], config=args)
    args = wandb.config
  train(args)
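
The if-block above keeps W&B optional. As a minimal, self-contained sketch of the same gating pattern, assuming the standard-library argparse in place of the project-specific parse_cmd helper (all flag names, defaults, and the train stub below are illustrative):

import argparse

def train(args):
  print('training with', args)

def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('--use_wandb', action='store_true')
  parser.add_argument('--framework', default='tensorflow')
  parser.add_argument('--run_name', default='baseline')
  parser.add_argument('--project', default='demo')
  args = vars(parser.parse_args())

  if args['use_wandb'] and 'tensorflow' in args['framework']:
    import wandb  # imported lazily so the dependency stays optional
    wandb.init(name=args['run_name'], project=args['project'], config=args)
    args = wandb.config  # wandb.config mirrors the dict and records it with the run

  train(args)

if __name__ == '__main__':
  main()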
Example #2
def main():
    """ function called when starting the code via command-line
  """
    args = argparse.parse_cmd(arguments)
    args['server'] = alchemy_api.start_training(args['server'])
    train(args)
Example #3
def train(config):
  """
  This function sets up:
    1- TensorFlow (XLA, GPU configuration, mixed precision, execution strategies)
    2- The datasets
    3- The model
    4- The execution environment
    5- The monitoring (UpStride platform and TensorBoard)

  Then:
    6- Starts the training
    7- Exports the model
  """

  # 1
  global_conf.config_tf2(config['config'])
  global_conf.setup_mp(config['config'])
  ds_strategy = global_conf.setup_strategy(config['config']['strategy'])
  if config['model']['channels_first']:  # if True, switch the Keras image data format to channels_first
    tf.keras.backend.set_image_data_format('channels_first')

  # 2
  train_dataset = dataloader.get_dataset(config['dataloader'], transformation_list=config['dataloader']['train_list'],
                                         num_classes=config['model']["num_classes"], split=config['dataloader']['train_split_id'])
  val_dataset = dataloader.get_dataset(config['dataloader'], transformation_list=config['dataloader']['val_list'],
                                       num_classes=config['model']["num_classes"], split=config['dataloader']['val_split_id'])

  # 3
  with ds_strategy.scope():
    model, optimizer = get_compiled_model(config)

  # 4
  checkpoint_dir, log_dir, export_dir = create_env_directories(get_experiment_name(config), config['checkpoint_dir'], config['log_dir'], config['export']['dir'])
  os.makedirs(log_dir, exist_ok=True)
  with open(os.path.join(log_dir, "conf.yml"), 'w') as file:
    yaml.dump(config, file)

  # 5
  config['server'] = alchemy_api.start_training(config['server'])
  alchemy_api.send_model_info(model, config['server'])
  callbacks = get_callbacks(config, log_dir)

  with ds_strategy.scope():  # the checkpoint objects need to be created in the same strategy scope as the model
    model_checkpoint_cb, latest_epoch = init_custom_checkpoint_callbacks({'model': model}, checkpoint_dir, config['max_checkpoints'], config['checkpoint_freq'])

  callbacks.append(model_checkpoint_cb)
  if config['server']['id'] != '':
    callbacks.append(alchemy_api.send_metric_callbacks(config['server']))

  if config['model']['name'] == 'Pdart':
    from src.models.pdart import callback_epoch
    callbacks.append(tf.keras.callbacks.LambdaCallback(on_epoch_begin=lambda epoch, logs: callback_epoch(epoch, config['num_epochs'], config['drop_path_prob'])))

  # 6 training
  if config['progressive_resizing']:
    progressive_training(model=model,
                         config=config,
                         train_dataset=train_dataset,
                         val_dataset=val_dataset,
                         callbacks=callbacks,
                         latest_epoch=latest_epoch,
                         max_queue_size=16,
                         optimizer=optimizer)
  else:
    model.fit(x=train_dataset,
              validation_data=val_dataset,
              epochs=config['num_epochs'],
              callbacks=callbacks,
              max_queue_size=16,
              initial_epoch=latest_epoch
              )


  # 7 export the model
  print("export model")
  export.export(model, export_dir, config)
  print("Training Completed!!")