import json
import logging
import os
import pickle
import platform
import random
import shutil
from pathlib import Path

import numpy as np
import tensorflow as tf
from tensorflow.keras import mixed_precision

# Project-specific helpers used below (parse_config, make_model, get_strategy, get_datasets,
# get_heptfds_dataset, get_lr_schedule, get_optimizer, get_loss_dict, set_config_loss,
# configure_model_weights, prepare_callbacks, create_experiment_dir, get_best_checkpoint,
# customization_functions, FlattenedCategoricalAccuracy, SingleClassRecall) are assumed to be
# imported from the surrounding mlpf/tfmodel modules of this repository.


def model_builder(hp):
    # Note: `config` and `total_steps` are expected to be in scope here, e.g. as a closure
    # over the enclosing hypertuning routine.

    # config["parameters"]["combined_graph_layer"]["hidden_dim"] = hp.Choice("hidden_dim", values=[128])
    # config["parameters"]["combined_graph_layer"]["distance_dim"] = hp.Choice("distance_dim", values=[128])
    config["parameters"]["combined_graph_layer"]["num_node_messages"] = hp.Choice("num_node_messages", [1, 2])
    config["parameters"]["num_graph_layers_common"] = hp.Choice("num_graph_layers_common", [1, 2, 3])
    config["parameters"]["num_graph_layers_energy"] = hp.Choice("num_graph_layers_energy", [1, 2, 3])
    config["parameters"]["combined_graph_layer"]["dropout"] = hp.Choice("cg_dropout", values=[0.0, 0.1, 0.2])
    config["parameters"]["output_decoding"]["dropout"] = hp.Choice("output_dropout", values=[0.0, 0.1, 0.2])
    config["parameters"]["combined_graph_layer"]["bin_size"] = hp.Choice("bin_size", values=[160, 320, 640])
    config["parameters"]["combined_graph_layer"]["ffn_dist_hidden_dim"] = hp.Choice("ffn_dist_hidden_dim", values=[64, 128, 256])
    config["parameters"]["combined_graph_layer"]["ffn_dist_num_layers"] = hp.Choice("ffn_dist_num_layers", values=[1, 2])
    config["parameters"]["combined_graph_layer"]["kernel"]["dist_mult"] = hp.Choice("dist_mult", values=[0.01, 0.1, 1.0])
    config["parameters"]["combined_graph_layer"]["node_message"]["output_dim"] = hp.Choice("output_dim", values=[128, 256, 512])
    config["parameters"]["combined_graph_layer"]["node_message"]["normalize_degrees"] = hp.Choice("normalize_degrees", values=[True, False])
    # config["setup"]["lr"] = hp.Choice("lr", values=[1e-4, 3e-4])
    # config["setup"]["lr_schedule"] = hp.Choice("lr_schedule", values=["exponentialdecay"])
    # config["setup"]["optimizer"] = hp.Choice("optimizer", values=["adam"])

    model = make_model(config, dtype="float32")
    model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"]))

    lr_schedule, _ = get_lr_schedule(config, steps=total_steps)
    opt = get_optimizer(config, lr_schedule)

    loss_dict, loss_weights = get_loss_dict(config)
    model.compile(
        loss=loss_dict,
        optimizer=opt,
        sample_weight_mode="temporal",
        loss_weights=loss_weights,
        metrics={
            "cls": [
                FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64),
                FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64),
            ]
        },
    )
    return model
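
# Hedged usage sketch (not part of the original excerpt): `model_builder` follows the
# KerasTuner model-builder convention, so it could be driven by a tuner such as
# kt.Hyperband. The `run_hypertune` name and the tuner settings below are illustrative
# assumptions, not the project's actual entry point; `config` and `total_steps` must be
# in scope as noted above.
def run_hypertune(ds_train, ds_test, outdir):
    import keras_tuner as kt  # assumption: the keras-tuner package is available

    tuner = kt.Hyperband(
        model_builder,
        objective="val_loss",
        max_epochs=10,
        directory=outdir,
        project_name="mlpf_hypertune",
    )
    # ds_train/ds_test are assumed to be finite tf.data datasets of (inputs, targets, weights)
    tuner.search(ds_train, validation_data=ds_test)
    return tuner.get_best_hyperparameters(num_trials=1)[0]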
def build_model_and_train(config, checkpoint_dir=None, full_config=None, ntrain=None, ntest=None, name=None, seeds=False):
    from ray import tune
    from ray.tune.integration.keras import TuneReportCheckpointCallback
    from raytune.search_space import set_raytune_search_parameters

    if seeds:
        # Set seeds for reproducibility
        random.seed(1234)
        np.random.seed(1234)
        tf.random.set_seed(1234)

    full_config, config_file_stem = parse_config(full_config)

    if config is not None:
        full_config = set_raytune_search_parameters(search_space=config, config=full_config)

    strategy, num_gpus = get_strategy()

    ds_train, num_train_steps = get_datasets(full_config["train_test_datasets"], full_config, num_gpus, "train")
    ds_test, num_test_steps = get_datasets(full_config["train_test_datasets"], full_config, num_gpus, "test")
    ds_val, ds_info = get_heptfds_dataset(
        full_config["validation_datasets"][0],
        full_config,
        num_gpus,
        "test",
        full_config["setup"]["num_events_validation"],
        supervised=False,
    )
    ds_val = ds_val.batch(5)

    if ntrain:
        ds_train = ds_train.take(ntrain)
        num_train_steps = ntrain
    if ntest:
        ds_test = ds_test.take(ntest)
        num_test_steps = ntest

    print("num_train_steps", num_train_steps)
    print("num_test_steps", num_test_steps)
    total_steps = num_train_steps * full_config["setup"]["num_epochs"]
    print("total_steps", total_steps)

    callbacks = prepare_callbacks(
        full_config,
        tune.get_trial_dir(),
        ds_val,
    )

    callbacks = callbacks[:-1]  # remove the CustomCallback at the end of the list

    with strategy.scope():
        lr_schedule, optim_callbacks = get_lr_schedule(full_config, steps=total_steps)
        callbacks.append(optim_callbacks)
        opt = get_optimizer(full_config, lr_schedule)

        model = make_model(full_config, dtype=tf.dtypes.float32)

        # Run model once to build the layers
        model.build((1, full_config["dataset"]["padded_num_elem_size"], full_config["dataset"]["num_input_features"]))

        full_config = set_config_loss(full_config, full_config["setup"]["trainable"])
        configure_model_weights(model, full_config["setup"]["trainable"])
        model.build((1, full_config["dataset"]["padded_num_elem_size"], full_config["dataset"]["num_input_features"]))

        loss_dict, loss_weights = get_loss_dict(full_config)
        model.compile(
            loss=loss_dict,
            optimizer=opt,
            sample_weight_mode="temporal",
            loss_weights=loss_weights,
            metrics={
                "cls": [
                    FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64),
                    FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64),
                ]
            },
        )
        model.summary()

        callbacks.append(
            TuneReportCheckpointCallback(
                metrics=[
                    "adam_beta_1",
                    "charge_loss",
                    "cls_acc_unweighted",
                    "cls_loss",
                    "cos_phi_loss",
                    "energy_loss",
                    "eta_loss",
                    "learning_rate",
                    "loss",
                    "pt_loss",
                    "sin_phi_loss",
                    "val_charge_loss",
                    "val_cls_acc_unweighted",
                    "val_cls_acc_weighted",
                    "val_cls_loss",
                    "val_cos_phi_loss",
                    "val_energy_loss",
                    "val_eta_loss",
                    "val_loss",
                    "val_pt_loss",
                    "val_sin_phi_loss",
                ],
            ),
        )

        try:
            model.fit(
                ds_train.repeat(),
                validation_data=ds_test.repeat(),
                epochs=full_config["setup"]["num_epochs"],
                callbacks=callbacks,
                steps_per_epoch=num_train_steps,
                validation_steps=num_test_steps,
            )
        except tf.errors.ResourceExhaustedError:
            logging.warning("Resource exhausted, skipping this hyperparameter configuration.")
            skiplog_file_path = Path(full_config["raytune"]["local_dir"]) / name / "skipped_configurations.txt"
            lines = ["{}: {}\n".format(item[0], item[1]) for item in config.items()]
            with open(skiplog_file_path, "a") as f:
                f.write("#" * 80 + "\n")
                for line in lines:
                    f.write(line)
                    logging.warning(line[:-1])
                f.write("#" * 80 + "\n\n")
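
# Hedged usage sketch (not part of the original excerpt): `build_model_and_train` has the
# call signature of a Ray Tune trainable, so a hyperparameter search could be launched
# roughly as below. The search-space dict, resource numbers and experiment name are
# illustrative assumptions; the project's real launcher may differ.
def run_raytune_search(search_space, full_config_path, expdir, name="mlpf_raytune"):
    from ray import tune

    analysis = tune.run(
        tune.with_parameters(build_model_and_train, full_config=full_config_path, name=name),
        config=search_space,
        resources_per_trial={"cpu": 8, "gpu": 1},
        num_samples=10,
        local_dir=expdir,
        name=name,
    )
    print("Best config:", analysis.get_best_config(metric="val_loss", mode="min"))
    return analysis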
def model_scope(config, total_steps, weights, horovod_enabled=False):
    lr_schedule, optim_callbacks, lr = get_lr_schedule(config, steps=total_steps)
    opt = get_optimizer(config, lr_schedule)

    if config["setup"]["dtype"] == "float16":
        model_dtype = tf.dtypes.float16
        policy = mixed_precision.Policy("mixed_float16")
        mixed_precision.set_global_policy(policy)
        opt = mixed_precision.LossScaleOptimizer(opt)
    else:
        model_dtype = tf.dtypes.float32

    model = make_model(config, model_dtype)

    # Build the layers after the element and feature dimensions are specified
    model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"]))

    initial_epoch = 0
    loaded_opt = None

    if weights:
        if lr_schedule:
            raise Exception("Restoring the optimizer state with a learning rate schedule is currently not supported")

        # We need to load the weights in the same trainable configuration as the model was set up
        configure_model_weights(model, config["setup"].get("weights_config", "all"))
        model.load_weights(weights, by_name=True)
        # Restore the optimizer state from a pickle file saved next to the weights, if present
        opt_weight_file = weights.replace("hdf5", "pkl").replace("/weights-", "/opt-")
        if os.path.isfile(opt_weight_file):
            loaded_opt = pickle.load(open(opt_weight_file, "rb"))
        initial_epoch = int(weights.split("/")[-1].split("-")[1])

    model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"]))

    config = set_config_loss(config, config["setup"]["trainable"])
    configure_model_weights(model, config["setup"]["trainable"])
    model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"]))

    print("model weights")
    tw_names = [m.name for m in model.trainable_weights]
    for w in model.weights:
        print("layer={} trainable={} shape={} num_weights={}".format(w.name, w.name in tw_names, w.shape, np.prod(w.shape)))

    loss_dict, loss_weights = get_loss_dict(config)
    model.compile(
        loss=loss_dict,
        optimizer=opt,
        sample_weight_mode="temporal",
        loss_weights=loss_weights,
        metrics={
            "cls": [
                FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64),
                FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64),
            ]
            + [
                SingleClassRecall(icls, name="rec_cls{}".format(icls), dtype=tf.float64)
                for icls in range(config["dataset"]["num_output_classes"])
            ]
        },
    )
    model.summary()

    # Set the optimizer weights
    if loaded_opt:

        def model_weight_setting():
            grad_vars = model.trainable_weights
            zero_grads = [tf.zeros_like(w) for w in grad_vars]
            # Apply a zero-gradient step so the optimizer slot variables are created before loading
            model.optimizer.apply_gradients(zip(zero_grads, grad_vars))
            if model.optimizer.__class__.__module__ == "keras.optimizers.optimizer_v1":
                model.optimizer.optimizer.optimizer.set_weights(loaded_opt["weights"])
            else:
                model.optimizer.set_weights(loaded_opt["weights"])

        # FIXME: check that this still works with multiple GPUs
        strategy = tf.distribute.get_strategy()
        strategy.run(model_weight_setting)

    return model, optim_callbacks, initial_epoch
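
# Hedged usage sketch (not part of the original excerpt): `model_scope` bundles model
# construction, optional weight/optimizer restoration and compilation, so a caller would
# typically invoke it inside the distribution strategy scope. The `build_compiled_model`
# wrapper name is illustrative; only helpers already used in this excerpt are called.
def build_compiled_model(config, total_steps, weights=None):
    strategy, num_gpus = get_strategy()
    with strategy.scope():
        model, optim_callbacks, initial_epoch = model_scope(config, total_steps, weights)
    return model, optim_callbacks, initial_epoch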
def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, customize):
    """Train a model defined by config"""

    try:
        from comet_ml import Experiment

        experiment = Experiment(
            project_name="particleflow-tf",
            auto_metric_logging=True,
            auto_param_logging=True,
            auto_histogram_weight_logging=True,
            auto_histogram_gradient_logging=False,
            auto_histogram_activation_logging=False,
        )
    except Exception as e:
        print("Failed to initialize comet-ml dashboard")
        experiment = None

    config_file_path = config
    config, config_file_stem = parse_config(config, nepochs=nepochs, weights=weights)

    if plot_freq:
        config["callbacks"]["plot_freq"] = plot_freq

    if customize:
        config = customization_functions[customize](config)

    if recreate or (weights is None):
        outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node())
    else:
        outdir = str(Path(weights).parent)

    # Decide tf.distribute.strategy depending on number of available GPUs
    strategy, num_gpus = get_strategy()

    # if "CPU" not in strategy.extended.worker_devices[0]:
    #     nvidia_smi_call = "nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f {}/nvidia_smi_log.csv".format(outdir)
    #     p = subprocess.Popen(shlex.split(nvidia_smi_call))

    ds_train, num_train_steps = get_datasets(config["train_test_datasets"], config, num_gpus, "train")
    ds_test, num_test_steps = get_datasets(config["train_test_datasets"], config, num_gpus, "test")
    ds_val, ds_info = get_heptfds_dataset(
        config["validation_dataset"], config, num_gpus, "test", config["setup"]["num_events_validation"]
    )
    ds_val = ds_val.batch(5)

    if ntrain:
        ds_train = ds_train.take(ntrain)
        num_train_steps = ntrain
    if ntest:
        ds_test = ds_test.take(ntest)
        num_test_steps = ntest

    print("num_train_steps", num_train_steps)
    print("num_test_steps", num_test_steps)
    total_steps = num_train_steps * config["setup"]["num_epochs"]
    print("total_steps", total_steps)

    if experiment:
        experiment.set_name(outdir)
        experiment.log_code("mlpf/tfmodel/model.py")
        experiment.log_code("mlpf/tfmodel/utils.py")
        experiment.log_code(config_file_path)

    shutil.copy(config_file_path, outdir + "/config.yaml")  # Copy the config file to the train dir for later reference

    with strategy.scope():
        lr_schedule, optim_callbacks = get_lr_schedule(config, steps=total_steps)
        opt = get_optimizer(config, lr_schedule)

        if config["setup"]["dtype"] == "float16":
            model_dtype = tf.dtypes.float16
            policy = mixed_precision.Policy("mixed_float16")
            mixed_precision.set_global_policy(policy)
            opt = mixed_precision.LossScaleOptimizer(opt)
        else:
            model_dtype = tf.dtypes.float32

        model = make_model(config, model_dtype)

        # Build the layers after the element and feature dimensions are specified
        model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"]))

        initial_epoch = 0
        if weights:
            # We need to load the weights in the same trainable configuration as the model was set up
            configure_model_weights(model, config["setup"].get("weights_config", "all"))
            model.load_weights(weights, by_name=True)
            initial_epoch = int(weights.split("/")[-1].split("-")[1])

        model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"]))

        config = set_config_loss(config, config["setup"]["trainable"])
        configure_model_weights(model, config["setup"]["trainable"])
        model.build((1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"]))

        print("model weights")
        tw_names = [m.name for m in model.trainable_weights]
        for w in model.weights:
            print("layer={} trainable={} shape={} num_weights={}".format(w.name, w.name in tw_names, w.shape, np.prod(w.shape)))

        loss_dict, loss_weights = get_loss_dict(config)
        model.compile(
            loss=loss_dict,
            optimizer=opt,
            sample_weight_mode="temporal",
            loss_weights=loss_weights,
            metrics={
                "cls": [
                    FlattenedCategoricalAccuracy(name="acc_unweighted", dtype=tf.float64),
                    FlattenedCategoricalAccuracy(use_weights=True, name="acc_weighted", dtype=tf.float64),
                ]
                + [
                    SingleClassRecall(icls, name="rec_cls{}".format(icls), dtype=tf.float64)
                    for icls in range(config["dataset"]["num_output_classes"])
                ]
            },
        )
        model.summary()

        callbacks = prepare_callbacks(config["callbacks"], outdir, ds_val, ds_info, comet_experiment=experiment)
        callbacks.append(optim_callbacks)

        fit_result = model.fit(
            ds_train.repeat(),
            validation_data=ds_test.repeat(),
            epochs=initial_epoch + config["setup"]["num_epochs"],
            callbacks=callbacks,
            steps_per_epoch=num_train_steps,
            validation_steps=num_test_steps,
            initial_epoch=initial_epoch,
        )

        history_path = Path(outdir) / "history"
        history_path = str(history_path)
        with open("{}/history.json".format(history_path), "w") as fi:
            json.dump(fit_result.history, fi)

        weights = get_best_checkpoint(outdir)
        print("Loading best weights that could be found from {}".format(weights))
        model.load_weights(weights, by_name=True)

        model.save(outdir + "/model_full", save_format="tf")
        print("Training done.")
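
# Hedged follow-up sketch (not part of the original excerpt): `train` exports the full model
# in the TensorFlow SavedModel format, so it can later be reloaded for inference without
# recompiling the custom losses and metrics. The helper name and path handling below are
# illustrative assumptions.
def load_trained_model(outdir):
    return tf.keras.models.load_model(outdir + "/model_full", compile=False)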