def main(): """ function called when starting the code via command-line """ args = argparse.parse_cmd(arguments) args['server'] = alchemy_api.start_training(args['server']) # Use weight and biases only use_wandb is true and framework is tensorflow if args['wandb_params']['use_wandb'] and "tensorflow" in args['framework']: import wandb wandb.init(name= args['wandb_params']['run_name'], project=args['wandb_params']['project'], config=args) args = wandb.config train(args)
def main(): """ function called when starting the code via command-line """ args = argparse.parse_cmd(arguments) args['server'] = alchemy_api.start_training(args['server']) train(args)
def train(config): """ This function setup: 1- Tensorflow (XLA, GPU configuration, mixed precision, execution strategies) 2- The datasets 3- The model 4- The execution environment 5- The monitoring (Upstride plateform and tensorboard) Then 6- start the training 7- Export the model """ # 1 global_conf.config_tf2(config['config']) global_conf.setup_mp(config['config']) ds_strategy = global_conf.setup_strategy(config['config']['strategy']) if config['model']['channels_first']: # if True set keras backend to channels_first tf.keras.backend.set_image_data_format('channels_first') # 2 train_dataset = dataloader.get_dataset(config['dataloader'], transformation_list=config['dataloader']['train_list'], num_classes=config['model']["num_classes"], split=config['dataloader']['train_split_id']) val_dataset = dataloader.get_dataset(config['dataloader'], transformation_list=config['dataloader']['val_list'], num_classes=config['model']["num_classes"], split=config['dataloader']['val_split_id']) # 3 with ds_strategy.scope(): model, optimizer = get_compiled_model(config) # 4 checkpoint_dir, log_dir, export_dir = create_env_directories(get_experiment_name(config), config['checkpoint_dir'], config['log_dir'], config['export']['dir']) if not os.path.exists(log_dir): os.makedirs(log_dir) with open(os.path.join(log_dir, "conf.yml"), 'w') as file: yaml.dump(config, file) # 5 config['server'] = alchemy_api.start_training(config['server']) alchemy_api.send_model_info(model, config['server']) callbacks = get_callbacks(config, log_dir) with ds_strategy.scope(): # checkpoints needs to be in the same scope. model_checkpoint_cb, latest_epoch = init_custom_checkpoint_callbacks({'model': model}, checkpoint_dir, config['max_checkpoints'], config['checkpoint_freq']) callbacks.append(model_checkpoint_cb) if config['server']['id'] != '': callbacks.append(alchemy_api.send_metric_callbacks(config['server'])) if config['model']['name'] == 'Pdart': from src.models.pdart import callback_epoch callbacks.append(tf.keras.callbacks.LambdaCallback(on_epoch_begin=lambda epoch, logs: callback_epoch(epoch, config['num_epochs'], config['drop_path_prob']))) # 6 training if config['progressive_resizing']: progressive_training(model=model, config=config, train_dataset=train_dataset, val_dataset=val_dataset, callbacks=callbacks, latest_epoch=latest_epoch, max_queue_size=16, optimizer=optimizer) else: model.fit(x=train_dataset, validation_data=val_dataset, epochs=config['num_epochs'], callbacks=callbacks, max_queue_size=16, initial_epoch=latest_epoch ) # 7 training print("export model") export.export(model, export_dir, config) print("Training Completed!!")