Example #1
        def train_func(config, reporter):
            logger.debug(
                f"CUDA_VISIBLE_DEVICES: {os.environ['CUDA_VISIBLE_DEVICES']}")

            for package_name in getattr(args, "include_package", ()):
                import_module_and_submodules(package_name)

            search_space = HyperparameterSearch(**config)
            sample = search_space.sample()
            for k, v in sample.items():
                config[k] = str(v)

            params_dict = json.loads(
                _jsonnet.evaluate_snippet("config",
                                          parameter_file_snippet,
                                          tla_codes={},
                                          ext_vars=config))
            if args.num_gpus == 0:
                logger.warning(f"No GPU specified, using CPU.")
                params_dict["trainer"]["cuda_device"] = -1

            if args.cpus_per_trial > 0:
                torch.set_num_threads(args.cpus_per_trial)

            params = Params(params_dict)

            logger.debug(f"AllenNLP Configuration: {params.as_dict()}")

            train_model(params=params, serialization_dir="trial")

            reporter(done=True)
Example #2
def main(args: argparse.Namespace):

    for package_name in args.include_package:
        import_module_and_submodules(package_name)

    archive = load_archive(args.archive_file, args.cuda_device, args.overrides,
                           args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    if "annotator" not in config:
        raise ConfigurationError(
            "Key 'annotator' is missing, sorry, cannot perform annotation")

    annotator = Annotator.from_params(config["annotator"])

    if annotator is None:
        raise ConfigurationError(
            "Trained model doesn't have an 'annotator' defined in config file."
        )
    t1 = time.time()
    annotator.annotate_file(model, args.input_file, args.output_file)
    t2 = time.time()
    print("Predicting took", round(t2 - t1, 3), "seconds.")
Example #3
def evaluate(serialization_dir: str, data_path: str, cuda: int, overrides: str,
             include_package: str):
    include_package = include_package.split(",")
    for package in include_package:
        import_module_and_submodules(package)

    serialization_dir = Path(serialization_dir)

    config_path = serialization_dir / "config.json"
    params = Params.from_file(config_path, overrides)

    data_path = data_path or params["test_data_path"]

    evaluate_args = argparse.Namespace(
        archive_file=str(serialization_dir / "model.tar.gz"),
        input_file=data_path,
        output_file=str(serialization_dir / "metrics_test.json"),
        cuda_device=cuda,
        overrides=overrides,
        weights_file=None,
        extend_vocab=None,
        embedding_sources_mapping=None,
        batch_weight_key=None,
        batch_size=None,
    )
    evaluate_from_args(evaluate_args)
Example #4
def main(archive_file: str, save_directory: str) -> None:
    """Saves the model and tokenizer from an AllenNLP `archive_file` path pointing to a trained
    DeCLUTR model to a format that can be used with HuggingFace Transformers at `save_directory`."""
    save_directory = Path(save_directory)
    save_directory.parents[0].mkdir(parents=True, exist_ok=True)

    common_util.import_module_and_submodules("declutr")
    # cuda_device -1 places the model onto the CPU before saving. This avoids issues with
    # distributed models.
    overrides = "{'trainer.cuda_device': -1}"
    archive = load_archive(archive_file, overrides=overrides)
    predictor = Predictor.from_archive(archive, predictor_name="declutr")

    token_embedder = predictor._model._text_field_embedder._token_embedders[
        "tokens"]
    model = token_embedder.transformer_model
    tokenizer = token_embedder.tokenizer

    # Casting as a string to avoid this error: https://github.com/huggingface/transformers/pull/4650
    # Can be removed after PR is merged and Transformers is updated.
    model.save_pretrained(str(save_directory))
    tokenizer.save_pretrained(str(save_directory))

    typer.secho(
        (f"{SAVING} {HUGGING_FACE} Transformers compatible model saved to: {save_directory}."
         " See https://huggingface.co/transformers/model_sharing.html for instructions on"
         f" hosting the model with {HUGGING_FACE} Transformers."),
        bold=True,
    )
Example #5
def main(args):
    """Driver function whose behavior is configured by a rich set of flags."""
    for package_name in args.include_package:
        import_module_and_submodules(package_name)

    vocab = Vocabulary.from_files("%s/vocabulary" % args.model_path)
    params = Params.from_file("%s/config.json" % args.model_path)
    model = _load_model(vocab, params.pop("model"), args)

    if not args.activations:
        x_name = "weight magnitude"
        parameters = [torch.flatten(param) for param in model.parameters()]
        parameters = torch.cat(parameters).abs()
        data = parameters.detach().numpy()

    else:
        x_name = "activation magnitude"
        sizes = {(768, 768), (768, 3072), (3072, 768)}
        parameters = [
            param for param in model.parameters()
            if tuple(param.size()) in sizes
        ]
        activations = torch.cat(
            [param.norm(p=2, dim=1) for param in parameters])
        data = activations.detach().numpy()

    plt.hist(data, bins=args.bins, range=[args.min, args.max])
    plt.xlabel(x_name)
    if args.log:
        plt.yscale("log")
    plt.savefig(args.save)
    print(f"Saved {args.save}.")
Example #6
def import_module_and_submodules_new(package_name: str) -> None:
    """
    Import all submodules under the given package.
    Primarily useful so that people using AllenNLP as a library
    can specify their own custom packages and have their custom
    classes get loaded and registered.
    """
    importlib.invalidate_caches()

    # For some reason, python doesn't always add this by default to your path, but you pretty much
    # always want it when using `--include-package`.  And if it's already there, adding it again at
    # the end won't hurt anything.
    with push_python_path("."):
        # Import at top level
        try:
            module = importlib.import_module(package_name)
        except Exception as e:
            print(e)
            module = None
        if module:
            path = getattr(module, "__path__", [])
            path_string = "" if not path else path[0]

            # walk_packages only finds immediate children, so need to recurse.
            for module_finder, name, _ in pkgutil.walk_packages(path):
                # Sometimes when you import third-party libraries that are on your path,
                # `pkgutil.walk_packages` returns those too, so we need to skip them.
                if path_string and module_finder.path != path_string:
                    continue
                subpackage = f"{package_name}.{name}"
                import_module_and_submodules(subpackage)
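
The function above is the mechanism behind AllenNLP's `--include-package` flag: once a package has been imported recursively, any class decorated with a registration decorator can be looked up by name. A minimal usage sketch, assuming a hypothetical package `my_project` that registers a model under the name `"my_model"`:

from allennlp.common.util import import_module_and_submodules
from allennlp.models import Model

# Recursively import the (hypothetical) package so its @Model.register("my_model")
# decorator runs and the class is added to the registry.
import_module_and_submodules("my_project")

# The registered class can now be resolved by name, e.g. when a config file refers to it.
model_class = Model.by_name("my_model")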
Example #7
def get_paragraphs(args):
    import_module_and_submodules("src")
    os.environ["weights"] = ""

    overrides_dict = {}
    ext_vars = {}
    params = Params.from_file(args.config_file, json.dumps(overrides_dict), ext_vars)
    dataset_reader = DatasetReader.from_params(
        params.get("validation_dataset_reader", "dataset_reader")
    )

    data_path = args.data
    if data_path is None:
        data_path = params.get("validation_data_path", None)
    assert data_path is not None, "--data is required"

    retrieved_paragraphs = {}
    for instance in dataset_reader.read(data_path):
        paragraphs_objs = instance["metadata"]["paragraphs"]
        if paragraphs_objs is not None:
            retrieved_paragraphs[instance["metadata"]["qid"]] = [
                p["evidence_id"] for p in paragraphs_objs
            ]
        else:
            retrieved_paragraphs[instance["metadata"]["qid"]] = []

    with open(args.output_file, "w", encoding="utf-8") as f:
        json.dump(retrieved_paragraphs, f, ensure_ascii=False, indent=4)
Example #8
def main(verbosity: int, include_package):
    """Cli entry point for fintopics."""
    config.clear()
    config.update(toml.load('textvinf.toml'))
    configure_logger(dict(config['logging']), verbosity)
    for pkg in include_package:
        import_module_and_submodules(pkg)
Example #9
def allennlp(
    path_to_senteval: str,
    path_to_allennlp_archive: str,
    output_filepath: str = None,
    weights_file: str = None,
    cuda_device: int = -1,
    output_dict_field: str = "embeddings",
    predictor_name: str = None,
    include_package: List[str] = None,
    prototyping_config: bool = False,
    verbose: bool = False,
) -> None:
    """Evaluates a trained AllenNLP model against the SentEval benchmark."""

    from allennlp.models.archival import load_archive
    from allennlp.predictors import Predictor

    # SentEval prepare and batcher
    def prepare(params, samples):
        return

    @torch.no_grad()
    def batcher(params, batch):
        batch = _cleanup_batch(batch)
        # Re-tokenize the input text using the tokenizer of the dataset reader
        inputs = [{"text": " ".join(tokens)} for tokens in batch]
        outputs = params.predictor.predict_batch_json(inputs)
        # AllenNLP models return a dictionary, so access the embeddings with the given key.
        embeddings = [output[output_dict_field] for output in outputs]

        embeddings = np.vstack(embeddings)
        return embeddings

    # Allows us to import custom dataset readers and models that may exist in the AllenNLP archive.
    # See: https://tinyurl.com/whkmoqh
    include_package = include_package or []
    for package_name in include_package:
        common_util.import_module_and_submodules(package_name)

    # Load the archived Model
    archive = load_archive(
        path_to_allennlp_archive,
        cuda_device=cuda_device,
        weights_file=weights_file,
        overrides="{'trainer.use_amp': true}",
    )
    predictor = Predictor.from_archive(archive, predictor_name)
    typer.secho(
        f'{SUCCESS} Model from AllenNLP archive "{path_to_allennlp_archive}" loaded successfully.',
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Performs a few setup steps and returns the SentEval params
    params_senteval = _setup_senteval(path_to_senteval, prototyping_config, verbose)
    params_senteval["predictor"] = predictor
    _run_senteval(params_senteval, path_to_senteval, batcher, prepare, output_filepath)

    return
Example #10
def run_tango(
    params: Params,
    serialization_dir: Union[str, PathLike],
    include_package: Optional[List[str]] = None,
    dry_run: bool = False,
    file_friendly_logging: bool = False,
):
    common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging

    if include_package is not None:
        for package_name in include_package:
            common_util.import_module_and_submodules(package_name)

    common_util.prepare_environment(params)

    step_graph = step_graph_from_params(params.pop("steps"))

    serialization_dir = Path(serialization_dir)
    serialization_dir.mkdir(parents=True, exist_ok=True)
    step_cache = DirectoryStepCache(serialization_dir / "step_cache")

    if dry_run:
        for step, cached in tango_dry_run(
            (s for s in step_graph.values() if not s.only_if_needed),
                step_cache):
            if cached:
                print(f"Getting {step.name} from cache")
            else:
                print(f"Computing {step.name}")
    else:
        # remove symlinks to old results
        for filename in serialization_dir.glob("*"):
            if filename.is_symlink():
                relative_target = os.readlink(filename)
                if not relative_target.startswith("step_cache/"):
                    continue
                logger.info(
                    f"Removing symlink '{filename.name}' to previous result {relative_target}"
                )
                filename.unlink()

        # produce results
        for name, step in step_graph.items():
            if not step.only_if_needed:
                step.ensure_result(step_cache)

        # symlink everything that has been computed
        for name, step in step_graph.items():
            if step in step_cache:
                step_link = serialization_dir / name
                step_link.unlink(missing_ok=True)
                step_link.symlink_to(
                    step_cache.path_for_step(step).relative_to(
                        serialization_dir),
                    target_is_directory=True,
                )
                print(f'The output for "{name}" is in {step_link}.')
Example #11
def load_dataset_reader_from_config(config_file_path: str,
                                    include_package: str = "models",
                                    reader_name: str = "dataset_reader",
                                    overrides: str = None):
    logging.disable(logging.INFO)
    import_module_and_submodules(include_package)
    params = Params.from_file(config_file_path, overrides)
    dataset_reader = DatasetReader.from_params(params.pop(reader_name))
    return dataset_reader
Example #12
 def __init__(self,
              pretrained_model_name_or_path: str,
              sphereize: bool = False,
              **kwargs) -> None:
     if pretrained_model_name_or_path in PRETRAINED_MODELS:
         pretrained_model_name_or_path = PRETRAINED_MODELS[
             pretrained_model_name_or_path]
     common_util.import_module_and_submodules("declutr")
     archive = load_archive(pretrained_model_name_or_path, **kwargs)
     self._predictor = Predictor.from_archive(archive,
                                              predictor_name="declutr")
     self._sphereize = sphereize
Example #13
def import_plugins() -> None:
    """
    Imports the plugins found with `discover_plugins()`.
    """
    for module in DEFAULT_PLUGINS:
        try:
            # For default plugins we recursively import everything.
            import_module_and_submodules(module)
        except ModuleNotFoundError:
            pass
    for module_name in discover_plugins():
        try:
            importlib.import_module(module_name)
        except ModuleNotFoundError as e:
            logger.error(f"Plugin {module_name} could not be loaded: {e}")
Example #14
 def __init__(
     self, pretrained_model_name_or_path: str, sphereize: bool = False, **kwargs
 ) -> None:
     if pretrained_model_name_or_path in PRETRAINED_MODELS:
         pretrained_model_name_or_path = PRETRAINED_MODELS[pretrained_model_name_or_path]
     pretrained_model_name_or_path = cached_path(pretrained_model_name_or_path)
     common_util.import_module_and_submodules("declutr")
     # Prevents a WARNING, which could confuse a user. Besides, performance is negatively
     # impacted when using mixed-precision during inference (in our case). Better to explicitly
     # prevent this scenario from happening.
     overrides = "{'trainer.opt_level': null}"
     archive = load_archive(pretrained_model_name_or_path, overrides=overrides, **kwargs)
     self._predictor = Predictor.from_archive(archive, predictor_name="declutr")
     self._output_dict_field = "embeddings"
     self._sphereize = sphereize
Example #15
    def test_import_submodules(self):
        (self.TEST_DIR / "mymodule").mkdir()
        (self.TEST_DIR / "mymodule" / "__init__.py").touch()
        (self.TEST_DIR / "mymodule" / "submodule").mkdir()
        (self.TEST_DIR / "mymodule" / "submodule" / "__init__.py").touch()
        (self.TEST_DIR / "mymodule" / "submodule" / "subsubmodule.py").touch()

        with push_python_path(self.TEST_DIR):
            assert "mymodule" not in sys.modules
            assert "mymodule.submodule" not in sys.modules

            util.import_module_and_submodules("mymodule")

            assert "mymodule" in sys.modules
            assert "mymodule.submodule" in sys.modules
            assert "mymodule.submodule.subsubmodule" in sys.modules
Example #16
def get_paragraphs(args):
    import_module_and_submodules("src")
    os.environ["weights"] = ""

    if os.path.exists(QUERIES_CACHE_PATH):
        with open(QUERIES_CACHE_PATH, "r", encoding="utf8") as f:
            queries_cache = json.load(f)
    else:
        queries_cache = {}
    size_before = len(queries_cache)

    cleaned_queries_cache = {}

    combinations = {
        "configs/strategy_qa/3_STAR_IR-Q.jsonnet": [
            "data/strategyqa/train.json",
            "data/strategyqa/dev.json",
        ],
        "configs/strategy_qa/4_STAR_IR-D.jsonnet":
        ["data/strategyqa/dev.json"],
        "configs/strategy_qa/5_STAR_IR-ORA-D.jsonnet": [
            "data/strategyqa/train.json",
            "data/strategyqa/dev.json",
        ],
    }

    for config_file in combinations:
        for data_path in combinations[config_file]:
            overrides_dict = {}
            ext_vars = {}
            params = Params.from_file(config_file, json.dumps(overrides_dict),
                                      ext_vars)
            dataset_reader = DatasetReader.from_params(
                params.get("validation_dataset_reader", "dataset_reader"))

            for instance in dataset_reader.read(data_path):
                queries = instance["metadata"]["queries"]
                for query in queries:
                    if query in dataset_reader._queries_cache:
                        cleaned_queries_cache[
                            query] = dataset_reader._queries_cache[query]

    print(f"Size before: {size_before}")
    print(f"Size after: {len(cleaned_queries_cache)}")
    with open(args.output_file, "w", encoding="utf8") as f:
        json.dump(cleaned_queries_cache, f)
Example #17
def load_model_and_dataset_reader(
        archive_file: str,
        include_package: str = "models",
        cuda_device: int = -1,
        overrides: str = None) -> Tuple[Model, DatasetReader]:
    logging.disable(logging.INFO)
    import_module_and_submodules(include_package)
    archive = load_archive(archive_file,
                           cuda_device=cuda_device,
                           overrides=overrides)
    config = archive.config
    prepare_environment(config)

    model = archive.model
    model.eval()

    dataset_reader = DatasetReader.from_params(config.pop("dataset_reader"))
    return model, dataset_reader
Example #18
def main(prog: Optional[str] = None) -> None:
    """
    The [`run`](./train.md#run) command only knows about the registered classes in the ``allennlp``
    codebase. In particular, once you start creating your own `Model` s and so forth, it won't
    work for them, unless you use the ``--include-package`` flag or you make your code available
    as a plugin (see [`plugins`](./plugins.md)).
    """
    parser, args = parse_args(prog)

    # If a subparser is triggered, it adds its work as `args.func`.
    # So if no such attribute has been added, no subparser was triggered,
    # so give the user some help.
    if "func" in dir(args):
        # Import any additional modules needed (to register custom classes).
        for package_name in getattr(args, "include_package", []):
            import_module_and_submodules(package_name)
        args.func(args)
    else:
        parser.print_help()
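
This is the console entry point that the `--include-package` flag and the plugin mechanism feed into. A hedged sketch of driving it programmatically, assuming this function is exposed as `allennlp.commands.main`; the config file and package name are placeholders, and the equivalent shell command would be `allennlp train my_config.jsonnet -s /tmp/out --include-package my_project`:

import sys
from allennlp.commands import main

# Simulate the command line; "my_config.jsonnet" and "my_project" are hypothetical.
sys.argv = ["allennlp", "train", "my_config.jsonnet", "-s", "/tmp/out",
            "--include-package", "my_project"]
main(prog="allennlp")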
Example #19
def instantiate_model_from_config(config_file_path: str,
                                  cuda_device: int = -1,
                                  overrides: str = None,
                                  include_package: str = "models") -> Model:
    logging.disable(logging.INFO)
    import_module_and_submodules(include_package)

    params = Params.from_file(config_file_path, overrides)

    vocab_dir = params.pop("vocabulary").pop("directory_path")
    vocab = Vocabulary.from_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop("model"))

    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model
Example #20
def load_model_from_file(
    serialization_dir: str,
    weights_file: str = _DEFAULT_WEIGHTS,
    include_package: str = "models",
    cuda_device: int = -1,
    overrides: str = None,
):
    logging.disable(logging.INFO)
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME),
                              overrides)

    import_module_and_submodules(include_package)

    model = Model.load(
        config,
        weights_file=os.path.join(serialization_dir, weights_file),
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
    )
    return model
Example #21
def smart_data_evaluate(model_path, predictor, test_data, include_package,
                        output_dir, overwrite_dir, use_mock_predictor):
    prepare_dir(output_dir, overwrite_dir)
    common_util.import_module_and_submodules(include_package)
    if "mock" in predictor.lower():
        use_mock_predictor = True
    if use_mock_predictor:
        splitted = predictor.split(".")
        mod = __import__(".".join(splitted[:-1]), fromlist=[splitted[-1]])
        klass = getattr(mod, splitted[-1])
        current_predictor_class = klass
    else:
        current_predictor_class = Predictor.by_name(predictor)

    @timeit
    def load_model(path):
        archive = load_archive(path)
        archive.model.eval()
        return current_predictor_class.from_archive(archive)

    @timeit
    def eval_model(predictor, test_data_path):
        evaluate_runner = EvaluationRunner(predictor, test_data_path,
                                           output_dir)
        evaluate_runner.evaluate(
            MRE=respect_only_mandatory_args,
            Cl=only_relation_classification,
            CRE=all_args_mandatory,
            AR=named_entity_recognition_v2,
            BRE=spert_only_two_mandatory_args,
            MRE_no_trigger=respect_only_mandatory_args_no_trigger,
            AR_no_trigger=named_entity_recognition_v2_no_trigger)
        evaluate_runner.save_report()

    if not use_mock_predictor:
        predictor = load_model(model_path)

    else:
        predictor = current_predictor_class(model_path)

    eval_model(predictor, test_data)
Example #22
    def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):

        from allennlp.models.archival import load_archive
        from allennlp.common.util import import_module_and_submodules
        from allennlp.common.util import prepare_environment
        import_module_and_submodules("danlp.models.allennlp_models")
        from danlp.models.allennlp_models.coref.predictors.coref import CorefPredictor

        # download the model or load the model path
        model_path = download_model('xlmr.coref',
                                    cache_dir,
                                    process_func=_unzip_process_func,
                                    verbose=verbose)

        archive = load_archive(model_path)
        self.config = archive.config
        prepare_environment(self.config)
        self.model = archive.model
        self.dataset_reader = archive.validation_dataset_reader
        self.predictor = CorefPredictor(model=self.model,
                                        dataset_reader=self.dataset_reader)
Example #23
def import_plugins() -> None:
    """
    Imports the plugins found with `discover_plugins()`.
    """

    # Workaround for a presumed Python issue where spawned processes can't find modules in the current directory.
    cwd = os.getcwd()
    if cwd not in sys.path:
        sys.path.append(cwd)

    for module_name in DEFAULT_PLUGINS:
        try:
            # For default plugins we recursively import everything.
            import_module_and_submodules(module_name)
            logger.info("Plugin %s available", module_name)
        except ModuleNotFoundError:
            pass
    for module_name in discover_plugins():
        try:
            importlib.import_module(module_name)
            logger.info("Plugin %s available", module_name)
        except ModuleNotFoundError as e:
            logger.error(f"Plugin {module_name} could not be loaded: {e}")
Example #24
    def from_pretrained(cls,
                        path: str,
                        tokenizer=tokenizers.SpacyTokenizer(),
                        batch_size: int = 500,
                        cuda_device: int = -1):
        util.import_module_and_submodules("combo.commands")
        util.import_module_and_submodules("combo.models")
        util.import_module_and_submodules("combo.training")

        if os.path.exists(path):
            model_path = path
        else:
            try:
                logger.debug("Downloading model.")
                model_path = download.download_file(path)
            except Exception as e:
                logger.error(e)
                raise e

        archive = models.load_archive(model_path, cuda_device=cuda_device)
        model = archive.model
        dataset_reader = allen_data.DatasetReader.from_params(
            archive.config["dataset_reader"])
        return cls(model, dataset_reader, tokenizer, batch_size)
Example #25
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("archive_file", type=str, help="path to an archived trained model")

    parser.add_argument(
        "input_file", type=str, help="path to the file containing the evaluation data"
    )

    parser.add_argument("--output-file", type=str, help="path to output file")

    parser.add_argument(
        "--weights-file", type=str, help="a path that overrides which weights file to use"
    )

    cuda_device = parser.add_mutually_exclusive_group(required=False)
    cuda_device.add_argument(
        "--cuda-device", type=int, default=-1, help="id of GPU to use (if any)"
    )

    parser.add_argument(
        "-o",
        "--overrides",
        type=str,
        default="",
        help="a JSON structure used to override the experiment configuration",
    )

    parser.add_argument(
        "--batch-size", type=int, help="If non-empty, the batch size to use during evaluation."
    )

    parser.add_argument(
        "--batch-weight-key",
        type=str,
        default="",
        help="If non-empty, name of metric used to weight the loss on a per-batch basis.",
    )

    parser.add_argument(
        "--extend-vocab",
        action="store_true",
        default=False,
        help="if specified, we will use the instances in your new dataset to "
        "extend your vocabulary. If pretrained-file was used to initialize "
        "embedding layers, you may also need to pass --embedding-sources-mapping.",
    )

    parser.add_argument(
        "--embedding-sources-mapping",
        type=str,
        default="",
        help="a JSON dict defining mapping from embedding module path to embedding "
        "pretrained-file used during training. If not passed, and embedding needs to be "
        "extended, we will try to use the original file paths used during training. If "
        "they are not available we will use random vectors for embedding extension.",
    )

    parser.add_argument(
        "--include-package",
        type=str,
        action="append",
        default=[],
        help="additional packages to include",
    )

    args = parser.parse_args()
    for package_name in args.include_package:
        import_module_and_submodules(package_name)
    parser.set_defaults(func=evaluate_from_args)
    evaluate_from_args(args)
Example #26
    def import_udify():
        from allennlp.common.util import import_module_and_submodules

        import_module_and_submodules("udify")
Example #27
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: Union[str, PathLike],
    include_package: List[str] = None,
    dry_run: bool = False,
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[int] = None,
    file_friendly_logging: bool = False,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the `Model` object and in distributed
    training, nothing is returned.

    # Parameters

    process_rank : `int`
        The process index that is initialized using the GPU device id.
    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results and logs.
    include_package : `List[str]`, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    dry_run : `bool`, optional (default=`False`)
        Do not train a model, but create a vocabulary, show dataset statistics and other training
        information.
    node_rank : `int`, optional
        Rank of the node.
    master_addr : `str`, optional (default=`"127.0.0.1"`)
        Address of the master node for distributed training.
    master_port : `int`, optional (default=`29500`)
        Port of the master node for distributed training.
    world_size : `int`, optional
        The number of processes involved in distributed training.
    distributed_device_ids : `List[int]`, optional
        IDs of the devices involved in distributed training.
    file_friendly_logging : `bool`, optional (default=`False`)
        If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.

    # Returns

    best_model : `Optional[Model]`
        The model with the best epoch weights or `None` if in distributed training or in dry run.
    """
    common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging

    common_logging.prepare_global_logging(
        serialization_dir,
        rank=process_rank,
        world_size=world_size,
    )
    common_util.prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    include_package = include_package or []

    if distributed:
        # Since the worker is spawned and not forked, the extra imports need to be done again.
        # Both the ones from the plugins and the ones from `include_package`.
        import_plugins()
        for package_name in include_package:
            common_util.import_module_and_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # Number of processes per node is useful to know if a process
        # is a master in the local node (the node in which it is running)
        os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node)

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        if gpu_id >= 0:
            torch.cuda.set_device(int(gpu_id))
            dist.init_process_group(
                backend="nccl",
                init_method=f"tcp://{master_addr}:{master_port}",
                world_size=world_size,
                rank=global_rank,
            )
        else:
            dist.init_process_group(
                backend="gloo",
                init_method=f"tcp://{master_addr}:{master_port}",
                world_size=world_size,
                rank=global_rank,
            )
        logging.info(f"Process group of world size {world_size} initialized "
                     f"for distributed training in worker {global_rank}")

    train_loop = TrainModel.from_params(
        params=params,
        serialization_dir=serialization_dir,
        local_rank=process_rank,
    )

    if dry_run:
        return None

    try:
        if distributed:  # let the setup get ready for all the workers
            dist.barrier()

        metrics = train_loop.run()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if master and os.path.exists(
                os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir)
        raise

    if master:
        train_loop.finish(metrics)

    if not distributed:
        return train_loop.model

    return None
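
In distributed mode this worker is never called directly; the training command spawns one copy per device and passes the process rank as the first argument. A hedged sketch of such a launch with `torch.multiprocessing.spawn`, assuming `params` and `serialization_dir` are already defined as in the function above; the GPU ids and the `"my_project"` package are hypothetical:

import torch.multiprocessing as mp

device_ids = [0, 1]  # hypothetical GPU ids
mp.spawn(
    _train_worker,
    # Positional args after the rank: params, serialization_dir, include_package,
    # dry_run, node_rank, master_addr, master_port, world_size, distributed_device_ids.
    args=(params, serialization_dir, ["my_project"], False, 0,
          "127.0.0.1", 29500, len(device_ids), device_ids),
    nprocs=len(device_ids),
)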
Example #28
def archive(request) -> Archive:
    if request.param in PRETRAINED_MODELS:
        pretrained_model_name_or_path = PRETRAINED_MODELS[request.param]
    else:
        # Fall back to treating the parameter itself as a path or URL.
        pretrained_model_name_or_path = request.param
    common_util.import_module_and_submodules("declutr")
    pretrained_model_name_or_path = cached_path(pretrained_model_name_or_path)
    return load_archive(pretrained_model_name_or_path)
Example #29
 def __init__(self):
     # TODO: should just import the exact submodule we need.
     import_module_and_submodules("allennlp_models")
     c = config.Model.from_file(
         os.path.join(os.path.dirname(__file__), "model.json"))
     super().__init__(c)
Example #30
File: main.py Project: ipipan/combo
def run(_):
    """Run model."""
    # Imports are required to make Registrable modules visible without passing parameter
    util.import_module_and_submodules("combo.commands")
    util.import_module_and_submodules("combo.models")
    util.import_module_and_submodules("combo.training")

    if FLAGS.mode == "train":
        checks.file_exists(FLAGS.config_path)
        params = common.Params.from_file(FLAGS.config_path,
                                         ext_vars=_get_ext_vars())
        model_params = params.get("model").as_ordered_dict()
        serialization_dir = tempfile.mkdtemp(prefix="allennlp",
                                             dir=FLAGS.serialization_dir)
        model = train.train_model(params,
                                  serialization_dir=serialization_dir,
                                  file_friendly_logging=True)
        logger.info(f"Training model stored in: {serialization_dir}")

        if FLAGS.finetuning_training_data_path:
            for f in FLAGS.finetuning_training_data_path:
                checks.file_exists(f)

            # Loading will be performed from stored model.tar.gz
            del model
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            params = common.Params.from_file(
                FLAGS.config_path, ext_vars=_get_ext_vars(finetuning=True))
            # Replace model definition with pretrained archive
            params["model"] = {
                "type": "from_archive",
                "archive_file": serialization_dir + "/model.tar.gz",
            }
            serialization_dir = tempfile.mkdtemp(prefix="allennlp",
                                                 suffix="-finetuning",
                                                 dir=FLAGS.serialization_dir)
            model = train.train_model(params.duplicate(),
                                      serialization_dir=serialization_dir,
                                      file_friendly_logging=True)

            # Make finetuning model serialization independent from training serialization
            # Storing model definition instead of archive
            params["model"] = model_params
            params.to_file(
                os.path.join(serialization_dir, archival.CONFIG_NAME))
            archival.archive_model(serialization_dir)

            logger.info(f"Finetuned model stored in: {serialization_dir}")

        if FLAGS.test_path and FLAGS.output_file:
            checks.file_exists(FLAGS.test_path)
            params = common.Params.from_file(
                FLAGS.config_path, ext_vars=_get_ext_vars())["dataset_reader"]
            params.pop("type")
            dataset_reader = dataset.UniversalDependenciesDatasetReader.from_params(
                params)
            predictor = predict.SemanticMultitaskPredictor(
                model=model, dataset_reader=dataset_reader)
            test_trees = dataset_reader.read(FLAGS.test_path)
            with open(FLAGS.output_file, "w") as file:
                for tree in test_trees:
                    file.writelines(
                        api.sentence2conllu(
                            predictor.predict_instance(tree),
                            keep_semrel=dataset_reader.use_sem).serialize())
    else:
        use_dataset_reader = FLAGS.conllu_format
        predictor = _get_predictor()
        if FLAGS.input_file == "-":
            use_dataset_reader = False
            predictor.without_sentence_embedding = True
        if use_dataset_reader:
            predictor.line_to_conllu = True
        if FLAGS.silent:
            logging.getLogger("allennlp.common.params").disabled = True
        manager = allen_predict._PredictManager(
            predictor,
            FLAGS.input_file,
            FLAGS.output_file,
            FLAGS.batch_size,
            not FLAGS.silent,
            use_dataset_reader,
        )
        manager.run()