Example #1
def model_builder(hp):
    # config["parameters"]["combined_graph_layer"]["hidden_dim"] = hp.Choice("hidden_dim", values=[128])
    # config["parameters"]["combined_graph_layer"]["distance_dim"] = hp.Choice("distance_dim", values=[128])
    config["parameters"]["combined_graph_layer"][
        "num_node_messages"] = hp.Choice("num_node_messages", [1, 2])
    config["parameters"]["num_graph_layers_common"] = hp.Choice(
        "num_graph_layers_common", [1, 2, 3])
    config["parameters"]["num_graph_layers_energy"] = hp.Choice(
        "num_graph_layers_energy", [1, 2, 3])
    config["parameters"]["combined_graph_layer"]["dropout"] = hp.Choice(
        "cg_dropout", values=[0.0, 0.1, 0.2])
    config["parameters"]["output_decoding"]["dropout"] = hp.Choice(
        "output_dropout", values=[0.0, 0.1, 0.2])
    config["parameters"]["combined_graph_layer"]["bin_size"] = hp.Choice(
        "bin_size", values=[160, 320, 640])
    config["parameters"]["combined_graph_layer"][
        "ffn_dist_hidden_dim"] = hp.Choice("ffn_dist_hidden_dim",
                                           values=[64, 128, 256])
    config["parameters"]["combined_graph_layer"][
        "ffn_dist_num_layers"] = hp.Choice("ffn_dist_num_layers", values=[1, 2])
    config["parameters"]["combined_graph_layer"]["kernel"][
        "dist_mult"] = hp.Choice("dist_mult", values=[0.01, 0.1, 1.0])
    config["parameters"]["combined_graph_layer"]["node_message"][
        "output_dim"] = hp.Choice("output_dim", values=[128, 256, 512])
    config["parameters"]["combined_graph_layer"]["node_message"][
        "normalize_degrees"] = hp.Choice("normalize_degrees",
                                         values=[True, False])

    # config["setup"]["lr"] = hp.Choice("lr", values=[1e-4, 3e-4])
    # config["setup"]["lr_schedule"] = hp.Choice("lr_schedule", values=["exponentialdecay"])
    # config["setup"]["optimizer"] = hp.Choice("optimizer", values=["adam"])

    model = make_model(config, dtype="float32")
    model.build((1, config["dataset"]["padded_num_elem_size"],
                 config["dataset"]["num_input_features"]))

    lr_schedule, _ = get_lr_schedule(config, steps=total_steps)
    opt = get_optimizer(config, lr_schedule)

    loss_dict, loss_weights = get_loss_dict(config)
    model.compile(
        loss=loss_dict,
        optimizer=opt,
        sample_weight_mode="temporal",
        loss_weights=loss_weights,
        metrics={
            "cls": [
                FlattenedCategoricalAccuracy(name="acc_unweighted",
                                             dtype=tf.float64),
                FlattenedCategoricalAccuracy(use_weights=True,
                                             name="acc_weighted",
                                             dtype=tf.float64),
            ]
        },
    )
    return model
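The model_builder above follows the Keras Tuner hypermodel signature: it receives a HyperParameters object and returns a compiled model, with config and total_steps taken from the enclosing scope. Below is a minimal usage sketch, assuming Keras Tuner's Hyperband; the objective, epoch budget, and directory names are illustrative assumptions, not part of the original.

# Hypothetical wiring of model_builder into Keras Tuner.
import keras_tuner as kt

tuner = kt.Hyperband(
    model_builder,            # builds and compiles one model per trial
    objective="val_loss",     # assumed tuning objective
    max_epochs=10,
    directory="hypertuning",
    project_name="mlpf",
)
tuner.search(
    ds_train.repeat(),
    validation_data=ds_test.repeat(),
    steps_per_epoch=num_train_steps,
    validation_steps=num_test_steps,
    epochs=10,
)
print(tuner.get_best_hyperparameters(num_trials=1)[0].values)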
Example #2
def build_model_and_train(config,
                          checkpoint_dir=None,
                          full_config=None,
                          ntrain=None,
                          ntest=None,
                          name=None,
                          seeds=False):
    from ray import tune
    from ray.tune.integration.keras import TuneReportCheckpointCallback
    from raytune.search_space import set_raytune_search_parameters

    if seeds:
        # Set seeds for reproducibility
        random.seed(1234)
        np.random.seed(1234)
        tf.random.set_seed(1234)

    full_config, config_file_stem = parse_config(full_config)

    if config is not None:
        full_config = set_raytune_search_parameters(search_space=config,
                                                    config=full_config)

    strategy, num_gpus = get_strategy()

    ds_train, num_train_steps = get_datasets(
        full_config["train_test_datasets"], full_config, num_gpus, "train")
    ds_test, num_test_steps = get_datasets(full_config["train_test_datasets"],
                                           full_config, num_gpus, "test")
    ds_val, ds_info = get_heptfds_dataset(
        full_config["validation_datasets"][0],
        full_config,
        num_gpus,
        "test",
        full_config["setup"]["num_events_validation"],
        supervised=False,
    )
    ds_val = ds_val.batch(5)

    if ntrain:
        ds_train = ds_train.take(ntrain)
        num_train_steps = ntrain
    if ntest:
        ds_test = ds_test.take(ntest)
        num_test_steps = ntest

    print("num_train_steps", num_train_steps)
    print("num_test_steps", num_test_steps)
    total_steps = num_train_steps * full_config["setup"]["num_epochs"]
    print("total_steps", total_steps)

    callbacks = prepare_callbacks(
        full_config,
        tune.get_trial_dir(),
        ds_val,
    )

    callbacks = callbacks[:-1]  # remove the CustomCallback at the end of the list

    with strategy.scope():
        lr_schedule, optim_callbacks = get_lr_schedule(full_config,
                                                       steps=total_steps)
        callbacks.append(optim_callbacks)
        opt = get_optimizer(full_config, lr_schedule)

        model = make_model(full_config, dtype=tf.dtypes.float32)

        # Run model once to build the layers
        model.build((1, full_config["dataset"]["padded_num_elem_size"],
                     full_config["dataset"]["num_input_features"]))

        full_config = set_config_loss(full_config,
                                      full_config["setup"]["trainable"])
        configure_model_weights(model, full_config["setup"]["trainable"])
        model.build((1, full_config["dataset"]["padded_num_elem_size"],
                     full_config["dataset"]["num_input_features"]))

        loss_dict, loss_weights = get_loss_dict(full_config)
        model.compile(
            loss=loss_dict,
            optimizer=opt,
            sample_weight_mode="temporal",
            loss_weights=loss_weights,
            metrics={
                "cls": [
                    FlattenedCategoricalAccuracy(name="acc_unweighted",
                                                 dtype=tf.float64),
                    FlattenedCategoricalAccuracy(use_weights=True,
                                                 name="acc_weighted",
                                                 dtype=tf.float64),
                ]
            },
        )
        model.summary()

        callbacks.append(
            TuneReportCheckpointCallback(metrics=[
                "adam_beta_1",
                "charge_loss",
                "cls_acc_unweighted",
                "cls_loss",
                "cos_phi_loss",
                "energy_loss",
                "eta_loss",
                "learning_rate",
                "loss",
                "pt_loss",
                "sin_phi_loss",
                "val_charge_loss",
                "val_cls_acc_unweighted",
                "val_cls_acc_weighted",
                "val_cls_loss",
                "val_cos_phi_loss",
                "val_energy_loss",
                "val_eta_loss",
                "val_loss",
                "val_pt_loss",
                "val_sin_phi_loss",
            ]))

        try:
            model.fit(
                ds_train.repeat(),
                validation_data=ds_test.repeat(),
                epochs=full_config["setup"]["num_epochs"],
                callbacks=callbacks,
                steps_per_epoch=num_train_steps,
                validation_steps=num_test_steps,
            )
        except tf.errors.ResourceExhaustedError:
            logging.warning(
                "Resource exhausted, skipping this hyperparameter configuration."
            )
            skiplog_file_path = (Path(full_config["raytune"]["local_dir"]) /
                                 name / "skipped_configurations.txt")
            lines = [
                "{}: {}\n".format(item[0], item[1]) for item in config.items()
            ]

            with open(skiplog_file_path, "a") as f:
                f.write("#" * 80 + "\n")
                for line in lines:
                    f.write(line)
                    logging.warning(line[:-1])
                f.write("#" * 80 + "\n\n")
Example #3
def model_scope(config, total_steps, weights, horovod_enabled=False):
    lr_schedule, optim_callbacks, lr = get_lr_schedule(config,
                                                       steps=total_steps)
    opt = get_optimizer(config, lr_schedule)

    if config["setup"]["dtype"] == "float16":
        model_dtype = tf.dtypes.float16
        policy = mixed_precision.Policy("mixed_float16")
        mixed_precision.set_global_policy(policy)
        opt = mixed_precision.LossScaleOptimizer(opt)
    else:
        model_dtype = tf.dtypes.float32

    model = make_model(config, model_dtype)

    # Build the layers after the element and feature dimensions are specified
    model.build((1, config["dataset"]["padded_num_elem_size"],
                 config["dataset"]["num_input_features"]))

    initial_epoch = 0
    loaded_opt = None

    if weights:
        if lr_schedule:
            raise Exception(
                "Restoring the optimizer state with a learning rate schedule is currently not supported"
            )

        # We need to load the weights in the same trainable configuration as the model was set up
        configure_model_weights(model,
                                config["setup"].get("weights_config", "all"))
        model.load_weights(weights, by_name=True)
        opt_weight_file = weights.replace("hdf5",
                                          "pkl").replace("/weights-", "/opt-")
        if os.path.isfile(opt_weight_file):
            with open(opt_weight_file, "rb") as f:
                loaded_opt = pickle.load(f)

        initial_epoch = int(weights.split("/")[-1].split("-")[1])
    model.build((1, config["dataset"]["padded_num_elem_size"],
                 config["dataset"]["num_input_features"]))

    config = set_config_loss(config, config["setup"]["trainable"])
    configure_model_weights(model, config["setup"]["trainable"])
    model.build((1, config["dataset"]["padded_num_elem_size"],
                 config["dataset"]["num_input_features"]))

    print("model weights")
    tw_names = [m.name for m in model.trainable_weights]
    for w in model.weights:
        print("layer={} trainable={} shape={} num_weights={}".format(
            w.name, w.name in tw_names, w.shape, np.prod(w.shape)))

    loss_dict, loss_weights = get_loss_dict(config)

    model.compile(
        loss=loss_dict,
        optimizer=opt,
        sample_weight_mode="temporal",
        loss_weights=loss_weights,
        metrics={
            "cls": [
                FlattenedCategoricalAccuracy(name="acc_unweighted",
                                             dtype=tf.float64),
                FlattenedCategoricalAccuracy(
                    use_weights=True, name="acc_weighted", dtype=tf.float64),
            ] + [
                SingleClassRecall(
                    icls, name="rec_cls{}".format(icls), dtype=tf.float64)
                for icls in range(config["dataset"]["num_output_classes"])
            ]
        },
    )

    model.summary()

    # Set the optimizer weights
    if loaded_opt:

        def model_weight_setting():
            grad_vars = model.trainable_weights
            zero_grads = [tf.zeros_like(w) for w in grad_vars]
            model.optimizer.apply_gradients(zip(zero_grads, grad_vars))
            if model.optimizer.__class__.__module__ == "keras.optimizers.optimizer_v1":
                model.optimizer.optimizer.optimizer.set_weights(
                    loaded_opt["weights"])
            else:
                model.optimizer.set_weights(loaded_opt["weights"])

        # FIXME: check that this still works with multiple GPUs
        strategy = tf.distribute.get_strategy()
        strategy.run(model_weight_setting)

    return model, optim_callbacks, initial_epoch
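model_scope hands back the compiled model together with the optimizer callbacks and the starting epoch, so the caller is expected to invoke it under a tf.distribute strategy and forward those values to fit. A minimal caller sketch, reusing dataset and callback variables in the style of the other examples (an assumption, not shown in the original):

# Hypothetical caller: weights=None trains from scratch.
strategy, num_gpus = get_strategy()
with strategy.scope():
    model, optim_callbacks, initial_epoch = model_scope(
        config, total_steps, weights=None)

model.fit(
    ds_train.repeat(),
    validation_data=ds_test.repeat(),
    epochs=initial_epoch + config["setup"]["num_epochs"],
    callbacks=callbacks + [optim_callbacks],
    steps_per_epoch=num_train_steps,
    validation_steps=num_test_steps,
    initial_epoch=initial_epoch,
)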
Example #4
def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq,
          customize):
    """Train a model defined by config."""

    try:
        from comet_ml import Experiment
        experiment = Experiment(
            project_name="particleflow-tf",
            auto_metric_logging=True,
            auto_param_logging=True,
            auto_histogram_weight_logging=True,
            auto_histogram_gradient_logging=False,
            auto_histogram_activation_logging=False,
        )
    except Exception as e:
        print("Failed to initialize comet-ml dashboard: {}".format(e))
        experiment = None
    config_file_path = config
    config, config_file_stem = parse_config(config,
                                            nepochs=nepochs,
                                            weights=weights)

    if plot_freq:
        config["callbacks"]["plot_freq"] = plot_freq

    if customize:
        config = customization_functions[customize](config)

    if recreate or (weights is None):
        outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_",
                                       suffix=platform.node())
    else:
        outdir = str(Path(weights).parent)

    # Decide tf.distribute.strategy depending on number of available GPUs
    strategy, num_gpus = get_strategy()
    #if "CPU" not in strategy.extended.worker_devices[0]:
    #    nvidia_smi_call = "nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f {}/nvidia_smi_log.csv".format(outdir)
    #    p = subprocess.Popen(shlex.split(nvidia_smi_call))

    ds_train, num_train_steps = get_datasets(config["train_test_datasets"],
                                             config, num_gpus, "train")
    ds_test, num_test_steps = get_datasets(config["train_test_datasets"],
                                           config, num_gpus, "test")
    ds_val, ds_info = get_heptfds_dataset(
        config["validation_dataset"], config, num_gpus, "test",
        config["setup"]["num_events_validation"])
    ds_val = ds_val.batch(5)

    if ntrain:
        ds_train = ds_train.take(ntrain)
        num_train_steps = ntrain
    if ntest:
        ds_test = ds_test.take(ntest)
        num_test_steps = ntest

    print("num_train_steps", num_train_steps)
    print("num_test_steps", num_test_steps)
    total_steps = num_train_steps * config["setup"]["num_epochs"]
    print("total_steps", total_steps)

    if experiment:
        experiment.set_name(outdir)
        experiment.log_code("mlpf/tfmodel/model.py")
        experiment.log_code("mlpf/tfmodel/utils.py")
        experiment.log_code(config_file_path)

    # Copy the config file to the train dir for later reference
    shutil.copy(config_file_path, outdir + "/config.yaml")

    with strategy.scope():
        lr_schedule, optim_callbacks = get_lr_schedule(config,
                                                       steps=total_steps)
        opt = get_optimizer(config, lr_schedule)

        if config["setup"]["dtype"] == "float16":
            model_dtype = tf.dtypes.float16
            policy = mixed_precision.Policy("mixed_float16")
            mixed_precision.set_global_policy(policy)
            opt = mixed_precision.LossScaleOptimizer(opt)
        else:
            model_dtype = tf.dtypes.float32

        model = make_model(config, model_dtype)

        # Build the layers after the element and feature dimensions are specified
        model.build((1, config["dataset"]["padded_num_elem_size"],
                     config["dataset"]["num_input_features"]))

        initial_epoch = 0
        if weights:
            # We need to load the weights in the same trainable configuration as the model was set up
            configure_model_weights(
                model, config["setup"].get("weights_config", "all"))
            model.load_weights(weights, by_name=True)
            initial_epoch = int(weights.split("/")[-1].split("-")[1])
        model.build((1, config["dataset"]["padded_num_elem_size"],
                     config["dataset"]["num_input_features"]))

        config = set_config_loss(config, config["setup"]["trainable"])
        configure_model_weights(model, config["setup"]["trainable"])
        model.build((1, config["dataset"]["padded_num_elem_size"],
                     config["dataset"]["num_input_features"]))

        print("model weights")
        tw_names = [m.name for m in model.trainable_weights]
        for w in model.weights:
            print("layer={} trainable={} shape={} num_weights={}".format(
                w.name, w.name in tw_names, w.shape, np.prod(w.shape)))

        loss_dict, loss_weights = get_loss_dict(config)
        model.compile(
            loss=loss_dict,
            optimizer=opt,
            sample_weight_mode="temporal",
            loss_weights=loss_weights,
            metrics={
                "cls": [
                    FlattenedCategoricalAccuracy(name="acc_unweighted",
                                                 dtype=tf.float64),
                    FlattenedCategoricalAccuracy(use_weights=True,
                                                 name="acc_weighted",
                                                 dtype=tf.float64),
                ] + [
                    SingleClassRecall(
                        icls, name="rec_cls{}".format(icls), dtype=tf.float64)
                    for icls in range(config["dataset"]["num_output_classes"])
                ]
            },
        )
        model.summary()

    callbacks = prepare_callbacks(config["callbacks"],
                                  outdir,
                                  ds_val,
                                  ds_info,
                                  comet_experiment=experiment)
    callbacks.append(optim_callbacks)

    fit_result = model.fit(
        ds_train.repeat(),
        validation_data=ds_test.repeat(),
        epochs=initial_epoch + config["setup"]["num_epochs"],
        callbacks=callbacks,
        steps_per_epoch=num_train_steps,
        validation_steps=num_test_steps,
        initial_epoch=initial_epoch,
    )

    history_path = str(Path(outdir) / "history")
    with open("{}/history.json".format(history_path), "w") as fi:
        json.dump(fit_result.history, fi)

    weights = get_best_checkpoint(outdir)
    print("Loading best weights that could be found from {}".format(weights))
    model.load_weights(weights, by_name=True)

    model.save(outdir + "/model_full", save_format="tf")

    print("Training done.")