Example #1
def compress(*, input: str, output: str, extrapolate: int, step: float,
             frequency: str, checkpoint_folder: str, training_script: str,
             mpi_log: str, log_path: Optional[str], log_level: int, **kwargs):
    """Compress model.

    The table is composed of fifth-order polynomial coefficients and is assembled
    from two sub-tables. The first table takes the step parameter as the domain's
    uniform step size, while the second table takes 10 * step as its uniform step
    size. The range of the first table is automatically detected by the code, while
    the second table ranges from the first table's upper boundary (upper) to
    extrapolate * upper.

    Parameters
    ----------
    input : str
        frozen model file to compress
    output : str
        compressed model filename
    extrapolate : int
        scale of model extrapolation
    step : float
        uniform step size of the tabulation's first table
    frequency : str
        frequency of tabulation overflow check
    checkpoint_folder : str
        training checkpoint folder for freezing
    training_script : str
        training script of the input frozen model
    mpi_log : str
        mpi logging mode for training
    log_path : Optional[str]
        if specified, the log will be written to this file
    log_level : int
        logging level
    """
    try:
        t_jdata = get_tensor_by_name(input, 'train_attr/training_script')
        t_min_nbor_dist = get_tensor_by_name(input, 'train_attr/min_nbor_dist')
        jdata = json.loads(t_jdata)
    except GraphWithoutTensorError as e:
        if training_script is None:
            raise RuntimeError(
                "The input frozen model: %s has no training script or min_nbor_dist information, "
                "which is not supported by the model compression interface. "
                "Please consider using the --training-script command within the model compression interface to provide the training script of the input frozen model. "
                "Note that the input training script must contain the correct path to the training data."
                % input) from e
        elif not os.path.exists(training_script):
            raise RuntimeError(
                "The input training script %s (%s) does not exist! Please check the path of the training script. "
                % (training_script, os.path.abspath(training_script))) from e
        else:
            log.info("stage 0: compute the min_nbor_dist")
            jdata = j_loader(training_script)
            jdata = update_deepmd_input(jdata)
            t_min_nbor_dist = get_min_nbor_dist(jdata, get_rcut(jdata))

    _check_compress_type(input)

    tf.constant(t_min_nbor_dist,
                name='train_attr/min_nbor_dist',
                dtype=GLOBAL_ENER_FLOAT_PRECISION)
    jdata["model"]["compress"] = {}
    jdata["model"]["compress"]["model_file"] = input
    jdata["model"]["compress"]["min_nbor_dist"] = t_min_nbor_dist
    jdata["model"]["compress"]["table_config"] = [
        extrapolate,
        step,
        10 * step,
        int(frequency),
    ]
    jdata["training"]["save_ckpt"] = "model-compression/model.ckpt"
    jdata = update_deepmd_input(jdata)
    jdata = normalize(jdata)

    # check the descriptor info of the input file
    # move to the specific Descriptor class

    # stage 1: training or refining the model with tabulation
    log.info("\n\n")
    log.info("stage 1: compress the model")
    control_file = "compress.json"
    with open(control_file, "w") as fp:
        json.dump(jdata, fp, indent=4)
    try:
        train(
            INPUT=control_file,
            init_model=None,
            restart=None,
            init_frz_model=None,
            output=control_file,
            mpi_log=mpi_log,
            log_level=log_level,
            log_path=log_path,
            is_compress=True,
        )
    except GraphTooLargeError as e:
        raise RuntimeError(
            "The uniform step size of the tabulation's first table is %f, "
            "which is too small. This leads to a very large graph size, "
            "exceeding protobuf's limitation (2 GB). You should try to "
            "increase the step size." % step) from e

    # stage 2: freeze the model
    log.info("\n\n")
    log.info("stage 2: freeze the model")
    try:
        freeze(checkpoint_folder=checkpoint_folder,
               output=output,
               node_names=None)
    except GraphTooLargeError as e:
        raise RuntimeError(
            "The uniform step size of the tabulation's first table is %f, "
            "which is too small. This leads to a very large graph size, "
            "exceeding protobuf's limitation (2 GB). You should try to "
            "increase the step size." % step) from e
Example #2
def train(
    *,
    INPUT: str,
    init_model: Optional[str],
    restart: Optional[str],
    output: str,
    init_frz_model: Optional[str],
    mpi_log: str,
    log_level: int,
    log_path: Optional[str],
    is_compress: bool = False,
    skip_neighbor_stat: bool = False,
    **kwargs,
):
    """Run DeePMD model training.

    Parameters
    ----------
    INPUT : str
        json/yaml control file
    init_model : Optional[str]
        path to checkpoint folder or None
    restart : Optional[str]
        path to checkpoint folder or None
    output : str
        path for dump file with arguments
    init_frz_model : Optional[str]
        path to frozen model or None
    mpi_log : str
        mpi logging mode
    log_level : int
        logging level defined by int 0-3
    log_path : Optional[str]
        logging file path or None if logs are to be output only to stdout
    is_compress : bool
        indicates whether the run is in model-compression mode
    skip_neighbor_stat : bool, default=False
        skip checking neighbor statistics

    Raises
    ------
    RuntimeError
        if distributed training job name is wrong
    """
    run_opt = RunOptions(
        init_model=init_model,
        restart=restart,
        init_frz_model=init_frz_model,
        log_path=log_path,
        log_level=log_level,
        mpi_log=mpi_log
    )
    if run_opt.is_distrib and len(run_opt.gpus or []) > 1:
        # avoid conflict of visible gpus among multiple tf sessions in one process
        reset_default_tf_session_config(cpu_only=True)

    # load the json/yaml control file
    jdata = j_loader(INPUT)

    jdata = update_deepmd_input(jdata, warning=True, dump="input_v2_compat.json")

    jdata = normalize(jdata)

    # infer the neighbor-list size ("sel") from training-data statistics unless
    # we are compressing an existing model or the check is explicitly skipped
    if not is_compress and not skip_neighbor_stat:
        jdata = update_sel(jdata)

    with open(output, "w") as fp:
        json.dump(jdata, fp, indent=4)

    # save the training script into the graph so that downstream tools
    # (e.g. the model-compression entry point above) can read it back
    tf.constant(json.dumps(jdata), name='train_attr/training_script', dtype=tf.string)

    for message in WELCOME + CITATION + BUILD:
        log.info(message)

    run_opt.print_resource_summary()
    _do_work(jdata, run_opt, is_compress)
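
A minimal sketch of a direct call to this entry point; the argument values are
illustrative assumptions (in practice the command-line interface fills them in):

# hypothetical invocation of the train entry point defined above
train(
    INPUT="input.json",    # assumed json/yaml control file
    init_model=None,       # start from scratch rather than a checkpoint
    restart=None,
    output="out.json",     # the resolved configuration is dumped here
    init_frz_model=None,
    mpi_log="master",
    log_level=2,
    log_path=None,         # log to stdout only
)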