Example #1
def run(config):
    hvd.init()  # Initialize Horovod once per worker process

    # Threading configuration
    if os.environ.get("OMP_NUM_THREADS", None) is not None:
        logger.debug(f"OMP_NUM_THREADS is {os.environ.get('OMP_NUM_THREADS')}")
        num_intra = int(os.environ.get("OMP_NUM_THREADS"))
        tf.config.threading.set_intra_op_parallelism_threads(num_intra)
        tf.config.threading.set_inter_op_parallelism_threads(2)

    seed = config["seed"]
    if seed is not None:
        np.random.seed(seed)
        tf.random.set_seed(seed)

    load_config(config)

    # Scale batch size and learning rate according to the number of ranks
    batch_size = config[a.hyperparameters][a.batch_size] * hvd.size()
    learning_rate = config[a.hyperparameters][a.learning_rate] * hvd.size()
    logger.info(
        f"Scaled: 'batch_size' from {config[a.hyperparameters][a.batch_size]} to {batch_size} "
    )
    logger.info(
        f"Scaled: 'learning_rate' from {config[a.hyperparameters][a.learning_rate]} to {learning_rate} "
    )
    config[a.hyperparameters][a.batch_size] = batch_size
    config[a.hyperparameters][a.learning_rate] = learning_rate

    input_shape, output_shape = setup_data(config)

    search_space = setup_search_space(config, input_shape, output_shape, seed=seed)

    model_created = False
    try:
        model = search_space.create_model()
        model_created = True
    except Exception:
        logger.error("Error: Model creation failed...")
        logger.error(traceback.format_exc())

    if model_created:

        # Set up Horovod callbacks first; user-configured callbacks are appended below
        callbacks = [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard or other metrics-based callbacks.
            hvd.callbacks.MetricAverageCallback(),
            # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
            # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
            # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
            #! initial_lr argument is not available in horovod==0.19.0
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=0),
        ]

        cb_requires_valid = False  # whether any callback requires validation data
        callbacks_config = config[a.hyperparameters].get(a.callbacks, {})
        if callbacks_config is not None:
            for cb_name, cb_conf in callbacks_config.items():
                if cb_name in default_callbacks_config:
                    # cb_name in hvd_root_cb implies the callback runs only on hvd.rank() == 0
                    if not (cb_name in hvd_root_cb) or hvd.rank() == 0:
                        default_callbacks_config[cb_name].update(cb_conf)

                        # Import and create corresponding callback
                        Callback = import_callback(cb_name)
                        callbacks.append(Callback(**default_callbacks_config[cb_name]))

                        if cb_name in ["EarlyStopping"]:
                            cb_requires_valid = "val" in cb_conf["monitor"].split("_")
                else:
                    logger.error(f"'{cb_name}' is not an accepted callback!")

        trainer = HorovodTrainerTrainValid(config=config, model=model)

        trainer.callbacks.extend(callbacks)

        last_only, with_pred = preproc_trainer(config)
        last_only = last_only and not cb_requires_valid

        history = trainer.train(with_pred=with_pred, last_only=last_only)

        result = compute_objective(config["objective"], history)
    else:
        # penalising actions if model cannot be created
        result = -1
    if result < -10 or np.isnan(result):
        result = -10
    return result
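
The pattern above follows Horovod's standard Keras workflow: initialize once per process, scale the batch size and learning rate linearly with hvd.size() (the linear scaling rule of Goyal et al., https://arxiv.org/abs/1706.02677), and register the broadcast and metric-averaging callbacks ahead of any metric-based callback. Below is a minimal, self-contained sketch of that skeleton, assuming horovod==0.19 as the original comment does; the optimizer and hyperparameters are placeholders, not part of the original.

import horovod.tensorflow.keras as hvd
import tensorflow as tf

hvd.init()  # one call per worker process

base_lr = 1e-3  # placeholder single-worker learning rate
opt = tf.keras.optimizers.SGD(base_lr * hvd.size())  # linear scaling rule
opt = hvd.DistributedOptimizer(opt)  # averages gradients across ranks

callbacks = [
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),  # sync initial weights from rank 0
    hvd.callbacks.MetricAverageCallback(),              # average metrics across workers
    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=0),
]

Example #2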
def run_distributed_base_trainer(config):

    physical_devices = tf.config.list_physical_devices("GPU")
    try:
        for i in range(len(physical_devices)):
            tf.config.experimental.set_memory_growth(physical_devices[i], True)
    except (ValueError, RuntimeError):
        # Invalid device, or memory growth cannot be changed once GPUs are initialized.
        pass

    distributed_strategy = tf.distribute.MirroredStrategy()
    n_replicas = distributed_strategy.num_replicas_in_sync

    seed = config["seed"]
    if seed is not None:
        np.random.seed(seed)
        tf.random.set_seed(seed)

    load_config(config)

    # Optionally scale batch size and learning rate according to the number of replicas
    initial_lr = config[a.hyperparameters][a.learning_rate]
    if config[a.hyperparameters].get("lsr_batch_size"):
        batch_size = config[a.hyperparameters][a.batch_size] * n_replicas
    else:
        batch_size = config[a.hyperparameters][a.batch_size]
    if config[a.hyperparameters].get("lsr_learning_rate"):
        learning_rate = config[a.hyperparameters][a.learning_rate] * n_replicas
    else:
        learning_rate = config[a.hyperparameters][a.learning_rate]
    logger.info(
        f"Scaled: 'batch_size' from {config[a.hyperparameters][a.batch_size]} to {batch_size} "
    )
    logger.info(
        f"Scaled: 'learning_rate' from {config[a.hyperparameters][a.learning_rate]} to {learning_rate} "
    )
    config[a.hyperparameters][a.batch_size] = batch_size
    config[a.hyperparameters][a.learning_rate] = learning_rate

    input_shape, output_shape = setup_data(config)

    search_space = get_search_space(config,
                                    input_shape,
                                    output_shape,
                                    seed=seed)

    model_created = False
    with distributed_strategy.scope():
        try:
            model = search_space.sample(config["arch_seq"])
            model_created = True
        except Exception:
            logger.error("Error: Model creation failed...")
            logger.error(traceback.format_exc())
        else:
            # Setup callbacks
            callbacks = []
            cb_requires_valid = False  # whether any callback requires validation data
            callbacks_config = config["hyperparameters"].get("callbacks")
            if callbacks_config is not None:
                for cb_name, cb_conf in callbacks_config.items():
                    if cb_name in default_callbacks_config:
                        default_callbacks_config[cb_name].update(cb_conf)

                        # Special dynamic parameters for callbacks
                        if cb_name == "ModelCheckpoint":
                            default_callbacks_config[cb_name][
                                "filepath"] = f'best_model_{config["id"]}.h5'

                        # replace patience hyperparameter
                        if "patience" in default_callbacks_config[cb_name]:
                            patience = config["hyperparameters"].get(
                                f"patience_{cb_name}")
                            if patience is not None:
                                default_callbacks_config[cb_name][
                                    "patience"] = patience

                        # Import and create corresponding callback
                        Callback = import_callback(cb_name)
                        callbacks.append(
                            Callback(**default_callbacks_config[cb_name]))

                        if cb_name in ["EarlyStopping"]:
                            cb_requires_valid = "val" in cb_conf[
                                "monitor"].split("_")
                    else:
                        logger.error(
                            f"'{cb_name}' is not an accepted callback!")

            # WarmupLR
            if config[a.hyperparameters].get("warmup_lr"):
                warmup_epochs = config[a.hyperparameters].get(
                    "warmup_epochs", 5)
                callbacks.append(
                    LearningRateWarmupCallback(
                        n_replicas=n_replicas,
                        warmup_epochs=warmup_epochs,
                        verbose=0,
                        initial_lr=initial_lr,
                    ))

            trainer = BaseTrainer(config=config, model=model)
            trainer.callbacks.extend(callbacks)

            last_only, with_pred = preproc_trainer(config)
            last_only = last_only and not cb_requires_valid

    if model_created:
        history = trainer.train(with_pred=with_pred, last_only=last_only)

        # save history
        save_history(config.get("log_dir", None), history, config)

        result = compute_objective(config["objective"], history)
    else:
        # penalising actions if model cannot be created
        result = -1
    if result < -10 or np.isnan(result):
        result = -10

    return result
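
This variant swaps Horovod for tf.distribute.MirroredStrategy (single-node, multi-GPU data parallelism). Two details matter: memory growth must be enabled before TensorFlow initializes the GPUs, and the model has to be built inside strategy.scope() so its variables are mirrored across replicas. A minimal runnable sketch of the same skeleton follows; the model and data are placeholders, not part of the original.

import numpy as np
import tensorflow as tf

# Memory growth must be set before any GPU is initialized.
for gpu in tf.config.list_physical_devices("GPU"):
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except (ValueError, RuntimeError):
        pass

strategy = tf.distribute.MirroredStrategy()
n_replicas = strategy.num_replicas_in_sync

with strategy.scope():
    # Variables created here are mirrored across all replicas.
    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
    model.compile(optimizer="sgd", loss="mse")

x = np.random.rand(64, 4).astype("float32")
y = np.random.rand(64, 1).astype("float32")
model.fit(x, y, batch_size=16 * n_replicas, epochs=1, verbose=0)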
Example #3
def run(config) -> Dict[str, Any]:
    # Threading configuration
    if os.environ.get("OMP_NUM_THREADS", None) is not None:
        logger.debug(f"OMP_NUM_THREADS is {os.environ.get('OMP_NUM_THREADS')}")
        num_intra = int(os.environ.get("OMP_NUM_THREADS"))
        tf.config.threading.set_intra_op_parallelism_threads(num_intra)
        tf.config.threading.set_inter_op_parallelism_threads(2)

    seed = config["seed"]
    if seed is not None:
        np.random.seed(seed)
        tf.random.set_seed(seed)

    logger.debug(f"cwd in run (alpha.py): {config['cwd']}")
    os.chdir(config['cwd'])

    load_config(config)

    logger.debug(f"config in alpha.py on host {socket.gethostname()}: {config}")

    input_shape, output_shape = setup_data(config)

    search_space = setup_search_space(config,
                                      input_shape,
                                      output_shape,
                                      seed=seed)

    model_created = False

    node_of_training = socket.gethostname()

    result_dict = {}
    try:
        model = search_space.create_model()
        model_created = True
    except Exception:
        logger.error("Error: Model creation failed...")
        logger.error(traceback.format_exc())

    if model_created:

        model_id = '___'.join([str(i) for i in config['arch_seq']])
        # Setup callbacks
        callbacks = []
        cb_requires_valid = False  # whether any callback requires validation data
        callbacks_config = config["hyperparameters"].get("callbacks")
        if callbacks_config is not None:
            for cb_name, cb_conf in callbacks_config.items():
                if cb_name in default_callbacks_config:
                    default_callbacks_config[cb_name].update(cb_conf)

                    # Special dynamic parameters for callbacks
                    if cb_name == "ModelCheckpoint":
                        default_callbacks_config[cb_name][
                            "filepath"] = f'best_model_{model_id}.h5'

                    # Import and create corresponding callback
                    Callback = import_callback(cb_name)
                    callbacks.append(
                        Callback(**default_callbacks_config[cb_name]))

                    if cb_name in ["EarlyStopping"]:
                        cb_requires_valid = "val" in cb_conf["monitor"].split(
                            "_")
                else:
                    logger.error(f"'{cb_name}' is not an accepted callback!")

        trainer = TrainerTrainValid(config=config, model=model)
        trainer.callbacks.extend(callbacks)

        last_only, with_pred = preproc_trainer(config)
        last_only = last_only and not cb_requires_valid

        logger.debug(f"cwd before training (alpha.py): {os.getcwd()}")
        history = trainer.train(with_pred=with_pred, last_only=last_only)

        result = compute_objective(config["objective"], history)
        result_dict['history'] = history

    else:
        # penalising actions if model cannot be created
        result = -1
    if result < -10 or np.isnan(result):
        result = -10

    result_dict['result'] = result
    result_dict['node'] = node_of_training

    return result_dict
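
Unlike the other variants, this run function returns a dictionary rather than a bare float, bundling the scalar objective with the full training history and the hostname of the worker that trained the model, which is useful when debugging multi-node searches. A hypothetical caller might consume it like this (the key names match the code above; everything else is illustrative):

result_dict = run(config)
print(f"objective={result_dict['result']} trained on {result_dict['node']}")
val_losses = result_dict['history'].get('val_loss', [])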
Example #4
def run(config: dict) -> float:
    # Threading configuration
    if os.environ.get("OMP_NUM_THREADS", None) is not None:
        logger.debug(f"OMP_NUM_THREADS is {os.environ.get('OMP_NUM_THREADS')}")
        num_intra = int(os.environ.get("OMP_NUM_THREADS"))
        tf.config.threading.set_intra_op_parallelism_threads(num_intra)
        tf.config.threading.set_inter_op_parallelism_threads(2)

    seed = config["seed"]
    if seed is not None:
        np.random.seed(seed)
        tf.random.set_seed(seed)

    load_config(config)

    input_shape, output_shape = setup_data(config)

    search_space = setup_search_space(config,
                                      input_shape,
                                      output_shape,
                                      seed=seed)

    model_created = False
    try:
        model = search_space.create_model()
        model_created = True
    except Exception:
        logger.error("Error: Model creation failed...")
        logger.error(traceback.format_exc())

    if model_created:

        # Setup callbacks
        callbacks = []
        cb_requires_valid = False  # whether any callback requires validation data
        callbacks_config = config["hyperparameters"].get("callbacks")
        if callbacks_config is not None:
            for cb_name, cb_conf in callbacks_config.items():
                if cb_name in default_callbacks_config:
                    default_callbacks_config[cb_name].update(cb_conf)

                    # Special dynamic parameters for callbacks
                    if cb_name == "ModelCheckpoint":
                        default_callbacks_config[cb_name][
                            "filepath"] = f'best_model_{config["id"]}.h5'

                    # replace patience hyperparameter
                    if "patience" in default_callbacks_config[cb_name]:
                        patience = config["hyperparameters"].get(
                            f"patience_{cb_name}")
                        if patience is not None:
                            default_callbacks_config[cb_name][
                                "patience"] = patience

                    # Import and create corresponding callback
                    Callback = import_callback(cb_name)
                    callbacks.append(
                        Callback(**default_callbacks_config[cb_name]))

                    if cb_name in ["EarlyStopping"]:
                        cb_requires_valid = "val" in cb_conf["monitor"].split(
                            "_")
                else:
                    logger.error(f"'{cb_name}' is not an accepted callback!")

        trainer = TrainerTrainValid(config=config, model=model)
        trainer.callbacks.extend(callbacks)

        last_only, with_pred = preproc_trainer(config)
        last_only = last_only and not cb_requires_valid

        history = trainer.train(with_pred=with_pred, last_only=last_only)

        # save history
        save_history(config.get("log_dir", None), history, config)

        result = compute_objective(config["objective"], history)
    else:
        # penalising actions if model cannot be created
        result = -1
    if result < -10 or np.isnan(result):
        result = -10
    return result
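
All four variants clamp the objective before returning it, so a failed or diverging training cannot destabilize the search: a model that cannot be built scores -1, and anything below -10 (or NaN, where checked) is floored at -10. The guard in isolation, as a hedged sketch with an illustrative helper name:

import numpy as np

def clamp_objective(result, floor=-10.0):
    # NaN (diverged training) and very negative objectives are mapped to the floor
    # so that one bad configuration cannot dominate the search signal.
    if result < floor or np.isnan(result):
        return floor
    return result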