Example 1
    def test_parse_overrides(self):
        assert parse_overrides("") == {}
        assert parse_overrides("{}") == {}

        override_dict = parse_overrides('{"train_data": "/train", "trainer.num_epochs": 10}')
        assert override_dict == {
            "train_data": "/train",
            "trainer": {
                "num_epochs": 10
            }
        }

        params = with_fallback(
            preferred=override_dict,
            fallback={
                "train_data": "/test",
                "model": "bidaf",
                "trainer": {"num_epochs": 100, "optimizer": "sgd"}
            })

        assert params == {
            "train_data": "/train",
            "model": "bidaf",
            "trainer": {"num_epochs": 10, "optimizer": "sgd"}
        }
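For orientation, here is a minimal sketch of the two helpers this test exercises. This is not AllenNLP's actual implementation (the real parse_overrides also accepts jsonnet snippets, not just JSON); it only illustrates the dotted-key unflattening and the recursive-merge semantics the assertions rely on:

import json
from typing import Any, Dict

def parse_overrides(serialized: str) -> Dict[str, Any]:
    # Parse the serialized overrides and unflatten dotted keys, so that
    # {"trainer.num_epochs": 10} becomes {"trainer": {"num_epochs": 10}}.
    if not serialized:
        return {}
    result: Dict[str, Any] = {}
    for dotted_key, value in json.loads(serialized).items():
        current = result
        *parents, leaf = dotted_key.split(".")
        for part in parents:
            current = current.setdefault(part, {})
        current[leaf] = value
    return result

def with_fallback(preferred: Dict[str, Any], fallback: Dict[str, Any]) -> Dict[str, Any]:
    # Recursive dict merge in which `preferred` wins on conflicting keys.
    merged = dict(fallback)
    for key, value in preferred.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = with_fallback(value, merged[key])
        else:
            merged[key] = value
    return merged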
Example 2
def yaml_to_params(params_file: str, overrides: str = "") -> Params:
    # redirect to cache, if necessary
    params_file = cached_path(params_file)

    with open(params_file) as f:
        file_dict = yaml.safe_load(f)

    overrides_dict = parse_overrides(overrides)
    param_dict = with_fallback(preferred=overrides_dict, fallback=file_dict)

    return Params(param_dict)
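A hypothetical call, with an illustrative file name and override string:

params = yaml_to_params("experiment.yaml", overrides='{"trainer.num_epochs": 5}')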
Example 3
def load_params(param_file, overrides):
    """Param loader with YAML support."""
    if not param_file.endswith(('.yaml', '.yml')):
        return Params.from_file(param_file, overrides)

    with open(param_file) as f:
        file_dict = yaml.safe_load(f)

    overrides_dict = parse_overrides(overrides)
    param_dict = with_fallback(preferred=overrides_dict, fallback=file_dict)
    return Params(param_dict)
Example 4
def _get_predictor(args: argparse.Namespace) -> Predictor:
    check_for_gpu(args.cuda_device)
    archive = load_archive(args.archive_file,
                           weights_file=args.weights_file,
                           cuda_device=args.cuda_device,
                           overrides=args.overrides)
    ov = parse_overrides(args.overrides)
    paper_features_path = None
    try:
        paper_features_path = ov['dataset_reader']['paper_features_path']
    except KeyError:
        pass
    return predictor_from_archive(archive, args.predictor, paper_features_path)
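The try/except guards a two-level lookup in the parsed overrides. An equivalent sketch using chained .get calls (assuming 'dataset_reader', when present, maps to a dict):

paper_features_path = ov.get('dataset_reader', {}).get('paper_features_path')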
Example 5
def load_archive_from_folder(archive_file: str,
                             cuda_device: int = -1,
                             overrides: str = "",
                             weights_file: str = None) -> Archive:
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    logger.info(f"loading model from direactory {archive_file}")

    serialization_dir = resolved_archive_file

    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, _FTA_NAME)
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacements_dict: Dict[str, Any] = {}
        for key, filename in files_to_archive.items():
            if not filename.startswith("/"):
                filename = os.path.join(serialization_dir, f"fta/{key}")
            replacements_dict[key] = filename

        overrides_dict = parse_overrides(overrides)
        combined_dict = with_fallback(preferred=unflatten(replacements_dict), fallback=overrides_dict)
        overrides = json.dumps(combined_dict)

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=weights_path,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device)

    return Archive(model=model, config=config)
Example 6
        def train_func(config, reporter):
            logger.debug(
                f"CUDA_VISIBLE_DEVICES: {os.environ['CUDA_VISIBLE_DEVICES']}")

            for package_name in getattr(run_args, "include_package", ()):
                import_submodules(package_name)

            run_parameters = {k: json.dumps(v) for k, v in config.items()}

            file_dict = json.loads(
                _jsonnet.evaluate_snippet("config",
                                          parameter_file_snippet,
                                          tla_codes=run_parameters))
            if default_args.num_gpus == 0:
                logger.warning(f"No GPU specified, using CPU.")
                file_dict["trainer"]["cuda_device"] = -1

            overrides_dict = parse_overrides(run_args.overrides)

            params_dict = with_fallback(preferred=overrides_dict,
                                        fallback=file_dict)

            # Make sure path is absolute (as Ray workers do not use the same working dir)
            train_data_path = params_dict["train_data_path"]
            validation_data_path = params_dict.get("validation_data_path")

            if not os.path.isabs(train_data_path):
                params_dict["train_data_path"] = os.path.abspath(
                    os.path.join(default_args.cwd, train_data_path))

            if validation_data_path and not os.path.isabs(
                    validation_data_path):
                params_dict["validation_data_path"] = os.path.abspath(
                    os.path.join(default_args.cwd, validation_data_path))

            params = Params(params_dict)

            logger.debug(f"AllenNLP Configuration: {params.as_dict()}")

            train_model(params=params, serialization_dir="./trial/")

            reporter(done=True)
Example 7
    def from_file(params_file: str,
                  params_overrides: str = "",
                  ext_vars: dict = None) -> 'Params':
        """
        Load a `Params` object from a configuration file.

        Parameters
        ----------
        params_file : ``str``
            The path to the configuration file to load.
        params_overrides : ``str``, optional
            A dict of overrides that can be applied to the final object.
            e.g. {"model.embedding_dim": 10}
        ext_vars : ``dict``, optional
            Our config files are Jsonnet, which allows specifying external variables
            for later substitution. Typically we substitute these using environment
            variables; however, you can also specify them here, in which case they
            take priority over environment variables.
            e.g. {"HOME_DIR": "/Users/allennlp/home"}
        """
        if ext_vars is None:
            ext_vars = {}

        options = ext_vars
        # Stringify values, since Jsonnet external variables must be strings
        ext_vars = {k: str(v) for k, v in ext_vars.items()}

        # redirect to cache, if necessary
        params_file = cached_path(params_file)
        ext_vars = {**_environment_variables(), **ext_vars}

        file_dict = json.loads(evaluate_file(params_file, ext_vars=ext_vars))

        overrides_dict = parse_overrides(params_overrides)
        param_dict = with_fallback(preferred=overrides_dict,
                                   fallback=file_dict)

        return OptionsParams(param_dict, options=options)
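A hypothetical call, reusing the docstring's own example values (the class name follows the snippet's return statement and is illustrative, not the canonical API):

params = OptionsParams.from_file('config.jsonnet',
                                 params_overrides='{"model.embedding_dim": 10}',
                                 ext_vars={'HOME_DIR': '/Users/allennlp/home'})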
Example 8
def _load_archive(archive_file: str,
                  adapters_dir: str,
                  cuda_device: int = -1,
                  overrides: str = "",
                  weights_file: str = None) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    weights_file: ``str``, optional (default = None)
        The weights file to use.  If unspecified, weights.th in the archive_file will be used.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides: ``str``, optional (default = "")
        JSON overrides to apply to the unarchived ``Params`` object.
    """

    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(f"loading archive file {archive_file} from cache at {resolved_archive_file}")

    if os.path.isdir(resolved_archive_file):
        serialization_dir = resolved_archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info(f"extracting archive file {resolved_archive_file} to temp dir {tempdir}")
        with tarfile.open(resolved_archive_file, 'r:gz') as archive:
            archive.extractall(tempdir)
        # Postpone cleanup until exit in case the unarchived contents are needed outside
        # this function.
        atexit.register(_cleanup_archive_dir, tempdir)

        serialization_dir = tempdir

    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, "files_to_archive.json")
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacements_dict: Dict[str, Any] = {}
        for key, original_filename in files_to_archive.items():
            replacement_filename = os.path.join(serialization_dir, f"fta/{key}")
            if os.path.exists(replacement_filename):
                replacements_dict[key] = replacement_filename
            else:
                logger.warning(f"Archived file {replacement_filename} not found! At train time "
                               f"this file was located at {original_filename}. This may be "
                               "because you are loading a serialization directory. Attempting to "
                               "load the file from its train-time location.")

        overrides_dict = parse_overrides(overrides)
        combined_dict = with_fallback(preferred=overrides_dict, fallback=unflatten(replacements_dict))
        overrides = json.dumps(combined_dict)

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, "config.json"), overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, "weights.th")
        # Fallback for serialization directories.
        if not os.path.exists(weights_path):
            weights_path = os.path.join(serialization_dir, "best.th")

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = _load(config.duplicate(),
                  adapters_dir=adapters_dir,
                  weights_file=weights_path,
                  serialization_dir=serialization_dir,
                  cuda_device=cuda_device)

    return Archive(model=model, config=config)
Example 9
def load_archive(archive_file: str,
                 cuda_device: int = -1,
                 overrides: str = "",
                 weights_file: str = None) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    weights_file: ``str``, optional (default = None)
        The weights file to use.  If unspecified, weights.th in the archive_file will be used.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides: ``str``, optional (default = "")
        JSON overrides to apply to the unarchived ``Params`` object.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(
            f"loading archive file {archive_file} from cache at {resolved_archive_file}"
        )

    tempdir = None
    if os.path.isdir(resolved_archive_file):
        serialization_dir = resolved_archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info(
            f"extracting archive file {resolved_archive_file} to temp dir {tempdir}"
        )
        with tarfile.open(resolved_archive_file, 'r:gz') as archive:
            archive.extractall(tempdir)

        serialization_dir = tempdir

    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, _FTA_NAME)
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacements_dict: Dict[str, Any] = {}
        for key, _ in files_to_archive.items():
            replacement_filename = os.path.join(serialization_dir,
                                                f"fta/{key}")
            replacements_dict[key] = replacement_filename

        overrides_dict = parse_overrides(overrides)
        combined_dict = with_fallback(preferred=unflatten(replacements_dict),
                                      fallback=overrides_dict)
        overrides = json.dumps(combined_dict)

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME),
                              overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=weights_path,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device)

    if tempdir:
        # Clean up temp dir
        shutil.rmtree(tempdir)

    return Archive(model=model, config=config)
Example 10
def load_archive(archive_file: str,
                 cuda_device: int = -1,
                 overrides: str = "",
                 weights_file: str = None) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    weights_file: ``str``, optional (default = None)
        The weights file to use.  If unspecified, weights.th in the archive_file will be used.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides: ``str``, optional (default = "")
        JSON overrides to apply to the unarchived ``Params`` object.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(f"loading archive file {archive_file} from cache at {resolved_archive_file}")

    if os.path.isdir(resolved_archive_file):
        serialization_dir = resolved_archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info(f"extracting archive file {resolved_archive_file} to temp dir {tempdir}")
        with tarfile.open(resolved_archive_file, 'r:gz') as archive:
            archive.extractall(tempdir)
        # Postpone cleanup until exit in case the unarchived contents are needed outside
        # this function.
        atexit.register(_cleanup_archive_dir, tempdir)

        serialization_dir = tempdir

    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, _FTA_NAME)
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacements_dict: Dict[str, Any] = {}
        for key, _ in files_to_archive.items():
            replacement_filename = os.path.join(serialization_dir, f"fta/{key}")
            replacements_dict[key] = replacement_filename

        overrides_dict = parse_overrides(overrides)
        combined_dict = with_fallback(preferred=unflatten(replacements_dict), fallback=overrides_dict)
        overrides = json.dumps(combined_dict)

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=weights_path,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device)

    return Archive(model=model, config=config)
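A hypothetical call (the archive path is illustrative):

archive = load_archive('model.tar.gz', cuda_device=0)
model = archive.model
config = archive.config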
Example 11
def setup(args):
    """ Create the blackbox function to optimize.

    This is a complex function that wraps the true parameter setting and training
    in subprocess calls to allennlp.
    """
    base_config = json.loads(_jsonnet.evaluate_file(args.base_config_path))
    search_config = json.loads(_jsonnet.evaluate_file(args.search_config_path))
    arg_overrides = parse_overrides(args.overrides)

    # Flatten configs and get shorthand mappings
    flat_base_config = flatten(base_config)
    flat_search_config = flatten(search_config)
    shorthands = get_shorthands(flat_search_config)

    # Extract any variable dimensions and the mapping to their keys
    search_space = extract_search_space(flat_search_config)
    lambdas = extract_lambdas(flat_search_config)
    dimensions = list(search_space.values())

    # We no longer use the base config as an initial point because the base config
    # needs to be minimal -- cannot contain fields which aren't used by certain hp
    # configurations since overrides cannot "delete" a field in the base config.
    x0 = None  # get_x0(flat_base_config, search_space)

    trial_num = 0
    trial_paths = dict()

    # Construct f
    def f(x):
        nonlocal trial_num
        nonlocal trial_paths

        # Map x onto the config keys that need updating
        newx = []
        for d, p in zip(dimensions, x):
            print(d.name, d, p, type(p))
            # Convert numpy scalars to native Python types so they serialize cleanly
            if 'numpy' in str(type(p)):
                p = p.item()
            newx.append(p)
        x = newx
        overrides = skopt.utils.point_asdict(search_space, x)
        overrides = fill_search_constants(overrides, flat_search_config)
        overrides = restrict_type_overrides(overrides, flat_search_config)

        # print(f'Overrides after fill and restrict: {json.dumps(overrides, indent=2)}')

        # Construct the trial serialization path
        trial_str = construct_trial_name(overrides, shorthands, trial_num)
        trial_path = os.path.join(args.serialization_dir, trial_str)
        trial_paths[trial_num] = trial_path

        # Construct the overrides string
        processed_overrides = format_overrides(overrides, lambdas, base_config, arg_overrides)
        print(f'Sampled config: {json.dumps(processed_overrides, indent=2)}')
        override_str = json.dumps(processed_overrides, indent=None)

        # Run Allennlp train subprocess
        cmd = f"allennlp train {args.base_config_path} -f -s {trial_path} -o '{override_str}' --file-friendly-logging --include-package {args.include_package}"
        print(f'CMD: {cmd}')
        try:
            subprocess.check_call(cmd, shell=True)
        except Exception as e:
            logger.error(e, exc_info=True)
            raise e

        trial_num += 1

        # Retrieve the best validation metric and return that value
        with open(os.path.join(trial_path, 'metrics.json')) as metrics_file:
            metrics = json.load(metrics_file)
        validation_metric = base_config['trainer']['validation_metric']
        negate = validation_metric.startswith('+')
        validation_metric = validation_metric.lstrip('+-')
        y = metrics[f'best_validation_{validation_metric}']
        if negate:
            y = -y

        return y

    # Construct a callback which maintains only the best weights/archive
    def delete_worse_files_cb(results):
        """ Remove .th and .gz files for any trials that aren't the best so far.
        """
        nonlocal trial_num
        nonlocal trial_paths
        logger.info(f'DELETE WORSE FILES, trial num:{trial_num}')

        best_trial_num = np.argmin(results.func_vals).item()
        logger.info(f'Func values: {results.func_vals},  best is {best_trial_num} with path {trial_paths[best_trial_num]}')
        for i in range(trial_num):
            if i != best_trial_num:
                logger.info(f'Deleting .th and .gz files at {trial_paths[i]}')
                th_path = os.path.join(trial_paths[i], '*.th')
                gz_path = os.path.join(trial_paths[i], '*.gz')
                cmd = f"rm -f {th_path} && rm -f {gz_path}"
                try:
                    subprocess.check_call(cmd, shell=True)
                except Exception as e:
                    logger.error(e, exc_info=True)
                    raise e

    return f, dimensions, x0, trial_paths, delete_worse_files_cb
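A hypothetical driver for the returned objective, assuming scikit-optimize's gp_minimize (the call arguments are illustrative):

f, dimensions, x0, trial_paths, delete_worse_files_cb = setup(args)
results = skopt.gp_minimize(f, dimensions,
                            x0=x0,
                            n_calls=20,
                            callback=[delete_worse_files_cb])
best = np.argmin(results.func_vals).item()
print(f'Best objective {results.fun} from trial at {trial_paths[best]}')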