Esempio n. 1
0
    def model_builder(hp):
        # config["parameters"]["combined_graph_layer"]["hidden_dim"] = hp.Choice("hidden_dim", values=[128])
        # config["parameters"]["combined_graph_layer"]["distance_dim"] = hp.Choice("distance_dim", values=[128])
        config["parameters"]["combined_graph_layer"][
            "num_node_messages"] = hp.Choice("num_node_messages", [1, 2])
        config["parameters"]["num_graph_layers_common"] = hp.Choice(
            "num_graph_layers_common", [1, 2, 3])
        config["parameters"]["num_graph_layers_energy"] = hp.Choice(
            "num_graph_layers_energy", [1, 2, 3])
        config["parameters"]["combined_graph_layer"]["dropout"] = hp.Choice(
            "cg_dropout", values=[0.0, 0.1, 0.2])
        config["parameters"]["output_decoding"]["dropout"] = hp.Choice(
            "output_dropout", values=[0.0, 0.1, 0.2])
        config["parameters"]["combined_graph_layer"]["bin_size"] = hp.Choice(
            "bin_size", values=[160, 320, 640])
        config["parameters"]["combined_graph_layer"][
            "ffn_dist_hidden_dim"] = hp.Choice("ffn_dist_hidden_dim",
                                               values=[64, 128, 256])
        config["parameters"]["combined_graph_layer"][
            "ffn_dist_num_layers"] = hp.Choice("ffn_dist_num_layers",
                                               values=[1, 2])
        config["parameters"]["combined_graph_layer"]["kernel"][
            "dist_mult"] = hp.Choice("dist_mult", values=[0.01, 0.1, 1.0])
        config["parameters"]["combined_graph_layer"]["node_message"][
            "output_dim"] = hp.Choice("output_dim", values=[128, 256, 512])
        config["parameters"]["combined_graph_layer"]["node_message"][
            "normalize_degrees"] = hp.Choice("normalize_degrees",
                                             values=[True, False])

        # config["setup"]["lr"] = hp.Choice("lr", values=[1e-4, 3e-4])
        # config["setup"]["lr_schedule"] = hp.Choice("lr_schedule", values=["exponentialdecay"])
        # config["setup"]["optimizer"] = hp.Choice("optimizer", values=["adam"])

        model = make_model(config, dtype="float32")
        model.build((1, config["dataset"]["padded_num_elem_size"],
                     config["dataset"]["num_input_features"]))

        lr_schedule, _ = get_lr_schedule(config, steps=total_steps)
        opt = get_optimizer(config, lr_schedule)

        loss_dict, loss_weights = get_loss_dict(config)
        model.compile(
            loss=loss_dict,
            optimizer=opt,
            sample_weight_mode="temporal",
            loss_weights=loss_weights,
            metrics={
                "cls": [
                    FlattenedCategoricalAccuracy(name="acc_unweighted",
                                                 dtype=tf.float64),
                    FlattenedCategoricalAccuracy(use_weights=True,
                                                 name="acc_weighted",
                                                 dtype=tf.float64),
                ]
            },
        )
        return model
Esempio n. 2
0
def evaluate(config, train_dir, weights, customize, nevents):
    """Evaluate the trained model in train_dir"""
    if config is None:
        config = Path(train_dir) / "config.yaml"
        assert config.exists(
        ), "Could not find config file in train_dir, please provide one with -c <path/to/config>"
    config, _ = parse_config(config, weights=weights)

    if customize:
        config = customization_functions[customize](config)

    if config["setup"]["dtype"] == "float16":
        model_dtype = tf.dtypes.float16
        policy = mixed_precision.Policy("mixed_float16")
        mixed_precision.set_global_policy(policy)
    else:
        model_dtype = tf.dtypes.float32

    strategy, num_gpus = get_strategy()
    # physical_devices = tf.config.list_physical_devices('GPU')
    # for dev in physical_devices:
    #    tf.config.experimental.set_memory_growth(dev, True)

    model = make_model(config, model_dtype)
    model.build((1, config["dataset"]["padded_num_elem_size"],
                 config["dataset"]["num_input_features"]))

    # need to load the weights in the same trainable configuration as the model was set up
    configure_model_weights(model, config["setup"].get("weights_config",
                                                       "all"))
    if weights:
        model.load_weights(weights, by_name=True)
    else:
        weights = get_best_checkpoint(train_dir)
        print(
            "Loading best weights that could be found from {}".format(weights))
        model.load_weights(weights, by_name=True)

    iepoch = int(weights.split("/")[-1].split("-")[1])

    for dsname in config["validation_datasets"]:
        ds_test, _ = get_heptfds_dataset(dsname,
                                         config,
                                         num_gpus,
                                         "test",
                                         supervised=False)
        if nevents:
            ds_test = ds_test.take(nevents)
        ds_test = ds_test.batch(5)
        eval_dir = str(
            Path(train_dir) / "evaluation" / "epoch_{}".format(iepoch) /
            dsname)
        Path(eval_dir).mkdir(parents=True, exist_ok=True)
        eval_model(model, ds_test, config, eval_dir)

    freeze_model(model, config, train_dir)
Esempio n. 3
0
def evaluate(config, train_dir, weights, evaluation_dir):
    """Evaluate the trained model in train_dir"""
    if config is None:
        config = Path(train_dir) / "config.yaml"
        assert config.exists(
        ), "Could not find config file in train_dir, please provide one with -c <path/to/config>"
    config, _ = parse_config(config, weights=weights)

    if evaluation_dir is None:
        eval_dir = str(Path(train_dir) / "evaluation")
    else:
        eval_dir = evaluation_dir

    Path(eval_dir).mkdir(parents=True, exist_ok=True)

    if config["setup"]["dtype"] == "float16":
        model_dtype = tf.dtypes.float16
        policy = mixed_precision.Policy("mixed_float16")
        mixed_precision.set_global_policy(policy)
        opt = mixed_precision.LossScaleOptimizer(opt)
    else:
        model_dtype = tf.dtypes.float32

    strategy, num_gpus = get_strategy()
    ds_test, _ = get_heptfds_dataset(config["validation_dataset"], config,
                                     num_gpus, "test")
    ds_test = ds_test.batch(5)

    model = make_model(config, model_dtype)
    model.build((1, config["dataset"]["padded_num_elem_size"],
                 config["dataset"]["num_input_features"]))

    # need to load the weights in the same trainable configuration as the model was set up
    configure_model_weights(model, config["setup"].get("weights_config",
                                                       "all"))
    if weights:
        model.load_weights(weights, by_name=True)
    else:
        weights = get_best_checkpoint(train_dir)
        print(
            "Loading best weights that could be found from {}".format(weights))
        model.load_weights(weights, by_name=True)

    eval_model(model, ds_test, config, eval_dir)
    freeze_model(model, config, ds_test.take(1), train_dir)
Esempio n. 4
0
def main(config):
    tf.config.run_functions_eagerly(False)
    config['setup']['multi_output'] = True
    model_pf = make_model(config, tf.float32)

    tb = tf.keras.callbacks.TensorBoard(
        log_dir="logs", histogram_freq=1, write_graph=False, write_images=False,
        update_freq='epoch',
        profile_batch=0,
    )
    tb.set_model(model_pf)

    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath="logs/weights-{epoch:02d}.hdf5",
        save_weights_only=True,
        verbose=0
    )
    cp_callback.set_model(model_pf)

    x = np.random.randn(1, config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"])
    ypred = concat_pf([model_pf(x), x])
    model_pf.load_weights("experiments/cms_20210909_132136_111774.gpu0.local/weights/weights-100-1.280379.hdf5", by_name=True)
    #model_pf.load_weights("./logs/weights-02.hdf5", by_name=True)

    model_disc = make_disc_model(config, ypred.shape[-1])

    num_gpus = 1
    ds = "cms_pf_ttbar"
    batch_size = 4
    config["datasets"][ds]["batch_per_gpu"] = batch_size
    ds_train, ds_info = get_heptfds_dataset(ds, config, num_gpus, "train", 128)
    ds_test, _ = get_heptfds_dataset(ds, config, num_gpus, "test", 128)
    ds_val, _ = get_heptfds_dataset(ds, config, num_gpus, "test", 128)

    cb = CustomCallback(
        "logs",
        ds_val,
        ds_info, plot_freq=10)

    cb.set_model(model_pf)

    input_elems = tf.keras.layers.Input(
        shape=(config["dataset"]["padded_num_elem_size"], config["dataset"]["num_input_features"]),
        batch_size=2*batch_size,
        name="input_detector_elements"
    )
    input_reco = tf.keras.layers.Input(
        shape=(config["dataset"]["padded_num_elem_size"], ypred.shape[-1]), name="input_reco_particles")
    pf_out = tf.keras.layers.Lambda(concat_pf)([model_pf(input_elems), input_elems])
    disc_out1 = model_disc([input_elems, pf_out])
    disc_out2 = model_disc([input_elems, input_reco])
    m1 = tf.keras.models.Model(inputs=[input_elems], outputs=[disc_out1], name="model_mlpf_disc")
    m2 = tf.keras.models.Model(inputs=[input_elems, input_reco], outputs=[disc_out2], name="model_reco_disc")

    def loss(x,y):
        return tf.keras.losses.binary_crossentropy(x,y, from_logits=True)

    #The MLPF reconstruction model (generator) is optimized to confuse the discriminator
    optimizer1 = tf.keras.optimizers.Adam(lr=1e-5)
    model_disc.trainable = False
    m1.compile(loss=loss, optimizer=optimizer1)
    m1.summary()

    #The discriminator model (adversarial) is optimized to distinguish between the true target and MLPF-reconstructed events
    optimizer2 = tf.keras.optimizers.Adam(lr=1e-5)
    model_pf.trainable = False
    model_disc.trainable = True
    m2.compile(loss=loss, optimizer=optimizer2)
    m2.summary()

    epochs = 1000

    ibatch = 0
    for epoch in range(epochs):
        loss_tot1 = 0.0
        loss_tot2 = 0.0
        loss_tot1_test = 0.0
        loss_tot2_test = 0.0


        for step, (xb, yb, wb) in tqdm(enumerate(ds_train), desc="Training"):

            msk_x = tf.cast(xb[:, :, 0:1]!=0, tf.float32)

            yp = concat_pf([model_pf(xb, training=True), xb])
            yb = concat_pf([yb, xb])

            yb = yb*msk_x

            #Train the discriminative (adversarial) model
            #true target particles have a classification target of 1, MLPF reconstructed a target of 0
            mlpf_train_inputs = tf.concat([xb, xb], axis=0)
            # mlpf_train_inputs = mlpf_train_inputs + tf.random.normal(mlpf_train_inputs.shape, stddev=0.0001)

            mlpf_train_outputs = tf.concat([yb, yp], axis=0)
            mlpf_train_disc_targets = tf.concat([batch_size*[0.99], batch_size*[0.01]], axis=0)
            loss2 = m2.train_on_batch([mlpf_train_inputs, mlpf_train_outputs], mlpf_train_disc_targets)

            #Train the MLPF reconstruction (generative) model with an inverted target
            disc_train_disc_targets = tf.concat([batch_size*[1.0]], axis=0)
            loss1 = m1.train_on_batch(xb, disc_train_disc_targets)

            loss_tot1 += loss1
            loss_tot2 += loss2
            ibatch += 1

        import boost_histogram as bh
        import mplhep
        import matplotlib.pyplot as plt

        preds_0 = []
        preds_1 = []

        for step, (xb, yb, wb) in tqdm(enumerate(ds_test), desc="Testing"):
            msk_x = tf.cast(xb[:, :, 0:1]!=0, tf.float32)

            yp = concat_pf([model_pf(xb, training=False), xb])
            yb = concat_pf([yb, xb])

            yb = yb*msk_x

            #Train the discriminative (adversarial) model
            #true target particles have a classification target of 1, MLPF reconstructed a target of 0
            mlpf_train_inputs = tf.concat([xb, xb], axis=0)
            mlpf_train_outputs = tf.concat([yb, yp], axis=0)
            mlpf_train_disc_targets = tf.concat([batch_size*[0.99], batch_size*[0.01]], axis=0)
            loss2 = m2.test_on_batch([mlpf_train_inputs, mlpf_train_outputs], mlpf_train_disc_targets)

            #Train the MLPF reconstruction (generative) model with an inverted target
            disc_train_disc_targets = tf.concat([batch_size*[1.0]], axis=0)
            loss1 = m1.test_on_batch(xb, disc_train_disc_targets)

            p = m2.predict_on_batch([mlpf_train_inputs, mlpf_train_outputs])
            preds_0 += list(p[mlpf_train_disc_targets<0.5, 0])
            preds_1 += list(p[mlpf_train_disc_targets>=0.5, 0])

            loss_tot1_test += loss1
            loss_tot2_test += loss2

        print("Epoch {}, l1={:.5E}/{:.5E}, l2={:.5E}/{:.5E}".format(epoch, loss_tot1, loss_tot1_test, loss_tot2, loss_tot2_test))

        #Draw histograms of the discriminator outputs for monitoring
        minval = np.min(preds_0 + preds_1)
        maxval = np.max(preds_0 + preds_1)
        h0 = bh.Histogram(bh.axis.Regular(50, minval, maxval))
        h1 = bh.Histogram(bh.axis.Regular(50, minval, maxval))
        h0.fill(preds_0)
        h1.fill(preds_1)

        fig = plt.figure(figsize=(4,4))
        mplhep.histplot(h0, label="MLPF")
        mplhep.histplot(h1, label="Target")
        plt.xlabel("Adversarial classification output")
        plt.legend(loc="best", frameon=False)
        plt.savefig("logs/disc_{}.pdf".format(epoch), bbox_inches="tight")
        plt.close("all")

        tb.on_epoch_end(epoch, {
            "loss1": loss_tot1,
            "loss2": loss_tot2,
            "val_loss1": loss_tot1_test,
            "val_loss2": loss_tot2_test,
            "val_mean_p0": np.mean(preds_0),
            "val_std_p0": np.std(preds_0),
            "val_mean_p1": np.mean(preds_1),
            "val_std_p1": np.std(preds_1),
        })

        cp_callback.on_epoch_end(epoch)
        cb.on_epoch_end(epoch)
Esempio n. 5
0
def build_model_and_train(config,
                          checkpoint_dir=None,
                          full_config=None,
                          ntrain=None,
                          ntest=None,
                          name=None,
                          seeds=False):
    from ray import tune
    from ray.tune.integration.keras import TuneReportCheckpointCallback
    from raytune.search_space import set_raytune_search_parameters

    if seeds:
        # Set seeds for reproducibility
        random.seed(1234)
        np.random.seed(1234)
        tf.random.set_seed(1234)

    full_config, config_file_stem = parse_config(full_config)

    if config is not None:
        full_config = set_raytune_search_parameters(search_space=config,
                                                    config=full_config)

    strategy, num_gpus = get_strategy()

    ds_train, num_train_steps = get_datasets(
        full_config["train_test_datasets"], full_config, num_gpus, "train")
    ds_test, num_test_steps = get_datasets(full_config["train_test_datasets"],
                                           full_config, num_gpus, "test")
    ds_val, ds_info = get_heptfds_dataset(
        full_config["validation_datasets"][0],
        full_config,
        num_gpus,
        "test",
        full_config["setup"]["num_events_validation"],
        supervised=False,
    )
    ds_val = ds_val.batch(5)

    if ntrain:
        ds_train = ds_train.take(ntrain)
        num_train_steps = ntrain
    if ntest:
        ds_test = ds_test.take(ntest)
        num_test_steps = ntest

    print("num_train_steps", num_train_steps)
    print("num_test_steps", num_test_steps)
    total_steps = num_train_steps * full_config["setup"]["num_epochs"]
    print("total_steps", total_steps)

    callbacks = prepare_callbacks(
        full_config,
        tune.get_trial_dir(),
        ds_val,
    )

    callbacks = callbacks[:
                          -1]  # remove the CustomCallback at the end of the list

    with strategy.scope():
        lr_schedule, optim_callbacks = get_lr_schedule(full_config,
                                                       steps=total_steps)
        callbacks.append(optim_callbacks)
        opt = get_optimizer(full_config, lr_schedule)

        model = make_model(full_config, dtype=tf.dtypes.float32)

        # Run model once to build the layers
        model.build((1, full_config["dataset"]["padded_num_elem_size"],
                     full_config["dataset"]["num_input_features"]))

        full_config = set_config_loss(full_config,
                                      full_config["setup"]["trainable"])
        configure_model_weights(model, full_config["setup"]["trainable"])
        model.build((1, full_config["dataset"]["padded_num_elem_size"],
                     full_config["dataset"]["num_input_features"]))

        loss_dict, loss_weights = get_loss_dict(full_config)
        model.compile(
            loss=loss_dict,
            optimizer=opt,
            sample_weight_mode="temporal",
            loss_weights=loss_weights,
            metrics={
                "cls": [
                    FlattenedCategoricalAccuracy(name="acc_unweighted",
                                                 dtype=tf.float64),
                    FlattenedCategoricalAccuracy(use_weights=True,
                                                 name="acc_weighted",
                                                 dtype=tf.float64),
                ]
            },
        )
        model.summary()

        callbacks.append(
            TuneReportCheckpointCallback(metrics=[
                "adam_beta_1",
                "charge_loss",
                "cls_acc_unweighted",
                "cls_loss",
                "cos_phi_loss",
                "energy_loss",
                "eta_loss",
                "learning_rate",
                "loss",
                "pt_loss",
                "sin_phi_loss",
                "val_charge_loss",
                "val_cls_acc_unweighted",
                "val_cls_acc_weighted",
                "val_cls_loss",
                "val_cos_phi_loss",
                "val_energy_loss",
                "val_eta_loss",
                "val_loss",
                "val_pt_loss",
                "val_sin_phi_loss",
            ], ), )

        try:
            model.fit(
                ds_train.repeat(),
                validation_data=ds_test.repeat(),
                epochs=full_config["setup"]["num_epochs"],
                callbacks=callbacks,
                steps_per_epoch=num_train_steps,
                validation_steps=num_test_steps,
            )
        except tf.errors.ResourceExhaustedError:
            logging.warning(
                "Resource exhausted, skipping this hyperparameter configuration."
            )
            skiplog_file_path = Path(full_config["raytune"]["local_dir"]
                                     ) / name / "skipped_configurations.txt"
            lines = [
                "{}: {}\n".format(item[0], item[1]) for item in config.items()
            ]

            with open(skiplog_file_path, "a") as f:
                f.write("#" * 80 + "\n")
                for line in lines:
                    f.write(line)
                    logging.warning(line[:-1])
                f.write("#" * 80 + "\n\n")
Esempio n. 6
0
def find_lr(config, outdir, figname, logscale):
    """Run the Learning Rate Finder to produce a batch loss vs. LR plot from
    which an appropriate LR-range can be determined"""
    config, _ = parse_config(config)

    # Decide tf.distribute.strategy depending on number of available GPUs
    strategy, num_gpus = get_strategy()

    ds_train, num_train_steps = get_datasets(config["train_test_datasets"],
                                             config, num_gpus, "train")

    with strategy.scope():
        opt = tf.keras.optimizers.Adam(
            learning_rate=1e-7
        )  # This learning rate will be changed by the lr_finder
        if config["setup"]["dtype"] == "float16":
            model_dtype = tf.dtypes.float16
            policy = mixed_precision.Policy("mixed_float16")
            mixed_precision.set_global_policy(policy)
            opt = mixed_precision.LossScaleOptimizer(opt)
        else:
            model_dtype = tf.dtypes.float32

        model = make_model(config, model_dtype)
        config = set_config_loss(config, config["setup"]["trainable"])

        # Run model once to build the layers
        model.build((1, config["dataset"]["padded_num_elem_size"],
                     config["dataset"]["num_input_features"]))

        configure_model_weights(model, config["setup"]["trainable"])

        loss_dict, loss_weights = get_loss_dict(config)
        model.compile(
            loss=loss_dict,
            optimizer=opt,
            sample_weight_mode="temporal",
            loss_weights=loss_weights,
            metrics={
                "cls": [
                    FlattenedCategoricalAccuracy(name="acc_unweighted",
                                                 dtype=tf.float64),
                    FlattenedCategoricalAccuracy(use_weights=True,
                                                 name="acc_weighted",
                                                 dtype=tf.float64),
                ]
            },
        )
        model.summary()

        max_steps = 200
        lr_finder = LRFinder(max_steps=max_steps)
        callbacks = [lr_finder]

        model.fit(
            ds_train.repeat(),
            epochs=max_steps,
            callbacks=callbacks,
            steps_per_epoch=1,
        )

        lr_finder.plot(save_dir=outdir, figname=figname, log_scale=logscale)
Esempio n. 7
0
def compute_validation_loss(config, train_dir, weights):
    """Evaluate the trained model in train_dir"""
    if config is None:
        config = Path(train_dir) / "config.yaml"
        assert config.exists(
        ), "Could not find config file in train_dir, please provide one with -c <path/to/config>"
    config, _ = parse_config(config, weights=weights)

    if config["setup"]["dtype"] == "float16":
        model_dtype = tf.dtypes.float16
        policy = mixed_precision.Policy("mixed_float16")
        mixed_precision.set_global_policy(policy)
    else:
        model_dtype = tf.dtypes.float32

    strategy, num_gpus = get_strategy()
    ds_test, num_test_steps = get_datasets(config["train_test_datasets"],
                                           config, num_gpus, "test")

    with strategy.scope():
        model = make_model(config, model_dtype)
        model.build((1, config["dataset"]["padded_num_elem_size"],
                     config["dataset"]["num_input_features"]))

        # need to load the weights in the same trainable configuration as the model was set up
        configure_model_weights(model,
                                config["setup"].get("weights_config", "all"))
        if weights:
            model.load_weights(weights, by_name=True)
        else:
            weights = get_best_checkpoint(train_dir)
            print("Loading best weights that could be found from {}".format(
                weights))
            model.load_weights(weights, by_name=True)

        loss_dict, loss_weights = get_loss_dict(config)
        model.compile(
            loss=loss_dict,
            # sample_weight_mode="temporal",
            loss_weights=loss_weights,
            metrics={
                "cls": [
                    FlattenedCategoricalAccuracy(name="acc_unweighted",
                                                 dtype=tf.float64),
                    FlattenedCategoricalAccuracy(use_weights=True,
                                                 name="acc_weighted",
                                                 dtype=tf.float64),
                ] + [
                    SingleClassRecall(
                        icls, name="rec_cls{}".format(icls), dtype=tf.float64)
                    for icls in range(config["dataset"]["num_output_classes"])
                ]
            },
        )

        losses = model.evaluate(
            x=ds_test,
            steps=num_test_steps,
            return_dict=True,
        )
    with open("{}/losses.txt".format(train_dir), "w") as loss_file:
        loss_file.write(json.dumps(losses) + "\n")
Esempio n. 8
0
def model_scope(config, total_steps, weights, horovod_enabled=False):
    lr_schedule, optim_callbacks, lr = get_lr_schedule(config,
                                                       steps=total_steps)
    opt = get_optimizer(config, lr_schedule)

    if config["setup"]["dtype"] == "float16":
        model_dtype = tf.dtypes.float16
        policy = mixed_precision.Policy("mixed_float16")
        mixed_precision.set_global_policy(policy)
        opt = mixed_precision.LossScaleOptimizer(opt)
    else:
        model_dtype = tf.dtypes.float32

    model = make_model(config, model_dtype)

    # Build the layers after the element and feature dimensions are specified
    model.build((1, config["dataset"]["padded_num_elem_size"],
                 config["dataset"]["num_input_features"]))

    initial_epoch = 0
    loaded_opt = None

    if weights:
        if lr_schedule:
            raise Exception(
                "Restoring the optimizer state with a learning rate schedule is currently not supported"
            )

        # We need to load the weights in the same trainable configuration as the model was set up
        configure_model_weights(model,
                                config["setup"].get("weights_config", "all"))
        model.load_weights(weights, by_name=True)
        opt_weight_file = weights.replace("hdf5",
                                          "pkl").replace("/weights-", "/opt-")
        if os.path.isfile(opt_weight_file):
            loaded_opt = pickle.load(open(opt_weight_file, "rb"))

        initial_epoch = int(weights.split("/")[-1].split("-")[1])
    model.build((1, config["dataset"]["padded_num_elem_size"],
                 config["dataset"]["num_input_features"]))

    config = set_config_loss(config, config["setup"]["trainable"])
    configure_model_weights(model, config["setup"]["trainable"])
    model.build((1, config["dataset"]["padded_num_elem_size"],
                 config["dataset"]["num_input_features"]))

    print("model weights")
    tw_names = [m.name for m in model.trainable_weights]
    for w in model.weights:
        print("layer={} trainable={} shape={} num_weights={}".format(
            w.name, w.name in tw_names, w.shape, np.prod(w.shape)))

    loss_dict, loss_weights = get_loss_dict(config)

    model.compile(
        loss=loss_dict,
        optimizer=opt,
        sample_weight_mode="temporal",
        loss_weights=loss_weights,
        metrics={
            "cls": [
                FlattenedCategoricalAccuracy(name="acc_unweighted",
                                             dtype=tf.float64),
                FlattenedCategoricalAccuracy(
                    use_weights=True, name="acc_weighted", dtype=tf.float64),
            ] + [
                SingleClassRecall(
                    icls, name="rec_cls{}".format(icls), dtype=tf.float64)
                for icls in range(config["dataset"]["num_output_classes"])
            ]
        },
    )

    model.summary()

    # Set the optimizer weights
    if loaded_opt:

        def model_weight_setting():
            grad_vars = model.trainable_weights
            zero_grads = [tf.zeros_like(w) for w in grad_vars]
            model.optimizer.apply_gradients(zip(zero_grads, grad_vars))
            if model.optimizer.__class__.__module__ == "keras.optimizers.optimizer_v1":
                model.optimizer.optimizer.optimizer.set_weights(
                    loaded_opt["weights"])
            else:
                model.optimizer.set_weights(loaded_opt["weights"])

        # FIXME: check that this still works with multiple GPUs
        strategy = tf.distribute.get_strategy()
        strategy.run(model_weight_setting)

    return model, optim_callbacks, initial_epoch
Esempio n. 9
0
def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq,
          customize):

    try:
        from comet_ml import Experiment
        experiment = Experiment(
            project_name="particleflow-tf",
            auto_metric_logging=True,
            auto_param_logging=True,
            auto_histogram_weight_logging=True,
            auto_histogram_gradient_logging=False,
            auto_histogram_activation_logging=False,
        )
    except Exception as e:
        print("Failed to initialize comet-ml dashboard")
        experiment = None
    """Train a model defined by config"""
    config_file_path = config
    config, config_file_stem = parse_config(config,
                                            nepochs=nepochs,
                                            weights=weights)

    if plot_freq:
        config["callbacks"]["plot_freq"] = plot_freq

    if customize:
        config = customization_functions[customize](config)

    if recreate or (weights is None):
        outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_",
                                       suffix=platform.node())
    else:
        outdir = str(Path(weights).parent)

    # Decide tf.distribute.strategy depending on number of available GPUs
    strategy, num_gpus = get_strategy()
    #if "CPU" not in strategy.extended.worker_devices[0]:
    #    nvidia_smi_call = "nvidia-smi --query-gpu=timestamp,name,pci.bus_id,pstate,power.draw,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 1 -f {}/nvidia_smi_log.csv".format(outdir)
    #    p = subprocess.Popen(shlex.split(nvidia_smi_call))

    ds_train, num_train_steps = get_datasets(config["train_test_datasets"],
                                             config, num_gpus, "train")
    ds_test, num_test_steps = get_datasets(config["train_test_datasets"],
                                           config, num_gpus, "test")
    ds_val, ds_info = get_heptfds_dataset(
        config["validation_dataset"], config, num_gpus, "test",
        config["setup"]["num_events_validation"])
    ds_val = ds_val.batch(5)

    if ntrain:
        ds_train = ds_train.take(ntrain)
        num_train_steps = ntrain
    if ntest:
        ds_test = ds_test.take(ntest)
        num_test_steps = ntest

    print("num_train_steps", num_train_steps)
    print("num_test_steps", num_test_steps)
    total_steps = num_train_steps * config["setup"]["num_epochs"]
    print("total_steps", total_steps)

    if experiment:
        experiment.set_name(outdir)
        experiment.log_code("mlpf/tfmodel/model.py")
        experiment.log_code("mlpf/tfmodel/utils.py")
        experiment.log_code(config_file_path)

    shutil.copy(config_file_path, outdir + "/config.yaml"
                )  # Copy the config file to the train dir for later reference

    with strategy.scope():
        lr_schedule, optim_callbacks = get_lr_schedule(config,
                                                       steps=total_steps)
        opt = get_optimizer(config, lr_schedule)

        if config["setup"]["dtype"] == "float16":
            model_dtype = tf.dtypes.float16
            policy = mixed_precision.Policy("mixed_float16")
            mixed_precision.set_global_policy(policy)
            opt = mixed_precision.LossScaleOptimizer(opt)
        else:
            model_dtype = tf.dtypes.float32

        model = make_model(config, model_dtype)

        # Build the layers after the element and feature dimensions are specified
        model.build((1, config["dataset"]["padded_num_elem_size"],
                     config["dataset"]["num_input_features"]))

        initial_epoch = 0
        if weights:
            # We need to load the weights in the same trainable configuration as the model was set up
            configure_model_weights(
                model, config["setup"].get("weights_config", "all"))
            model.load_weights(weights, by_name=True)
            initial_epoch = int(weights.split("/")[-1].split("-")[1])
        model.build((1, config["dataset"]["padded_num_elem_size"],
                     config["dataset"]["num_input_features"]))

        config = set_config_loss(config, config["setup"]["trainable"])
        configure_model_weights(model, config["setup"]["trainable"])
        model.build((1, config["dataset"]["padded_num_elem_size"],
                     config["dataset"]["num_input_features"]))

        print("model weights")
        tw_names = [m.name for m in model.trainable_weights]
        for w in model.weights:
            print("layer={} trainable={} shape={} num_weights={}".format(
                w.name, w.name in tw_names, w.shape, np.prod(w.shape)))

        loss_dict, loss_weights = get_loss_dict(config)
        model.compile(
            loss=loss_dict,
            optimizer=opt,
            sample_weight_mode="temporal",
            loss_weights=loss_weights,
            metrics={
                "cls": [
                    FlattenedCategoricalAccuracy(name="acc_unweighted",
                                                 dtype=tf.float64),
                    FlattenedCategoricalAccuracy(use_weights=True,
                                                 name="acc_weighted",
                                                 dtype=tf.float64),
                ] + [
                    SingleClassRecall(
                        icls, name="rec_cls{}".format(icls), dtype=tf.float64)
                    for icls in range(config["dataset"]["num_output_classes"])
                ]
            },
        )
        model.summary()

    callbacks = prepare_callbacks(config["callbacks"],
                                  outdir,
                                  ds_val,
                                  ds_info,
                                  comet_experiment=experiment)
    callbacks.append(optim_callbacks)

    fit_result = model.fit(
        ds_train.repeat(),
        validation_data=ds_test.repeat(),
        epochs=initial_epoch + config["setup"]["num_epochs"],
        callbacks=callbacks,
        steps_per_epoch=num_train_steps,
        validation_steps=num_test_steps,
        initial_epoch=initial_epoch,
    )

    history_path = Path(outdir) / "history"
    history_path = str(history_path)
    with open("{}/history.json".format(history_path), "w") as fi:
        json.dump(fit_result.history, fi)

    weights = get_best_checkpoint(outdir)
    print("Loading best weights that could be found from {}".format(weights))
    model.load_weights(weights, by_name=True)

    model.save(outdir + "/model_full", save_format="tf")

    print("Training done.")