def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load parameter file
    with open(args.config_file) as config_file:
        config = Params(replace_none(json.loads(config_file.read())))

    model = Model.load(config,
                       weights_file=args.weights_file,
                       cuda_device=args.cuda_device)
    model.eval()
    vocab = model._vocab  # pylint: disable=protected-access

    # Load the evaluation data
    dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    dataset = dataset_reader.read(evaluation_data_path)
    dataset.index_instances(vocab)

    iterator = DataIterator.from_params(config.pop("iterator"))
    metrics = evaluate(model, dataset, iterator, args.cuda_device)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics
def load_model(archive_file: str, cuda_device: int = -1) -> Model:
    """
    Instantiates a model from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    """
    # Extract archive to temp dir
    tempdir = tempfile.mkdtemp()
    logger.info("extracting archive file %s to temp dir %s", archive_file, tempdir)
    with tarfile.open(archive_file, 'r:gz') as archive:
        archive.extractall(tempdir)

    # Load config
    config = Params.from_file(os.path.join(tempdir, _CONFIG_NAME))

    # Instantiate model
    model = Model.load(config,
                       weights_file=os.path.join(tempdir, _WEIGHTS_NAME),
                       serialization_prefix=tempdir,
                       cuda_device=cuda_device)

    # Clean up temp dir
    shutil.rmtree(tempdir)

    return model
def load_archive(archive_file: str, cuda_device: int = -1) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    """
    # redirect to the cache, if necessary
    archive_file = cached_path(archive_file)

    # Extract archive to temp dir
    tempdir = tempfile.mkdtemp()
    logger.info("extracting archive file %s to temp dir %s", archive_file, tempdir)
    with tarfile.open(archive_file, 'r:gz') as archive:
        archive.extractall(tempdir)

    # Load config
    config = Params.from_file(os.path.join(tempdir, _CONFIG_NAME))

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=os.path.join(tempdir, _WEIGHTS_NAME),
                       serialization_dir=tempdir,
                       cuda_device=cuda_device)

    # Clean up temp dir
    shutil.rmtree(tempdir)

    return Archive(model=model, config=config)
def _load_model(config, weights_path, serialization_dir, cuda_device):
    return Model.load(
        config,
        weights_file=weights_path,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
    )
def run(model_path, test_path, config_path, output_path, batch_size):
    params_path = config_path or os.path.join(model_path, "config.json")
    params = Params.from_file(params_path)
    is_subwords = "tokenizer" in params["reader"] and params["reader"]["tokenizer"]["type"] == "subword"
    reader = DatasetReader.from_params(params.pop("reader"))
    device = 0 if torch.cuda.is_available() else -1
    model = Model.load(params, model_path, cuda_device=device)
    model.training = False
    predictor = Seq2SeqPredictor(model, reader)
    with open(output_path, "wt", encoding="utf-8") as w:
        for batch_number, batch in enumerate(get_batches(test_path, batch_size)):
            outputs = predictor.predict_batch_json(batch)
            assert len(outputs) == len(batch)
            for output in outputs:
                decoded_words = output["predicted_tokens"]
                if not decoded_words:
                    decoded_words = ["заявил"]
                if not is_subwords:
                    hyp = " ".join(decoded_words)
                else:
                    hyp = "".join(decoded_words).replace("▁", " ").replace("\n", "").strip()
                if len(hyp) <= 3:
                    hyp = "заявил"
                w.write(hyp + "\n")
def load_archive(
    archive_file: str, cuda_device: int = -1, overrides: str = "", weights_file: str = None
) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    # Parameters

    archive_file : `str`
        The archive file to load the model from.
    weights_file : `str`, optional (default = None)
        The weights file to use.  If unspecified, weights.th in the archive_file will be used.
    cuda_device : `int`, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides : `str`, optional (default = "")
        JSON overrides to apply to the unarchived `Params` object.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(f"loading archive file {archive_file} from cache at {resolved_archive_file}")

    if os.path.isdir(resolved_archive_file):
        serialization_dir = resolved_archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info(f"extracting archive file {resolved_archive_file} to temp dir {tempdir}")
        with tarfile.open(resolved_archive_file, "r:gz") as archive:
            archive.extractall(tempdir)
        # Postpone cleanup until exit in case the unarchived contents are needed outside
        # this function.
        atexit.register(_cleanup_archive_dir, tempdir)

        serialization_dir = tempdir

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)
        # Fallback for serialization directories.
        if not os.path.exists(weights_path):
            weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(
        config.duplicate(),
        weights_file=weights_path,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
    )

    return Archive(model=model, config=config)
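A minimal usage sketch of the `load_archive` pattern above (the archive path here is a placeholder, not taken from any of the snippets): load the archive, pull out the model and its config, and switch to eval mode before inference.

# Hedged example; "model.tar.gz" is a hypothetical path.
archive = load_archive("model.tar.gz", cuda_device=-1)
model = archive.model
model.eval()                  # disable dropout etc. for inference
config = archive.config      # the Params the model was trained with (not yet consumed)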
def load_model(weights_file, serialization_dir, cuda):
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME))
    config.loading_from_archive = True
    model = Model.load(config.duplicate(),
                       weights_file=weights_file,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda)
    return model
def load_model(serialization_dir, weights_file=None):
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME))
    config.loading_from_archive = True
    # Always load onto CPU, regardless of the device the model was trained on.
    cuda_device = -1
    weights_file = weights_file or os.path.join(serialization_dir, _WEIGHTS_NAME)
    model = Model.load(config.duplicate(),
                       weights_file=weights_file,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device)
    return model
def load_archive(archive_file: str,
                 cuda_device: int = -1,
                 overrides: str = "") -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides: ``str``, optional (default = "")
        HOCON overrides to apply to the unarchived ``Params`` object.
    """
    # redirect to the cache, if necessary
    archive_file = cached_path(archive_file)

    # Extract archive to temp dir
    tempdir = tempfile.mkdtemp()
    logger.info("extracting archive file %s to temp dir %s", archive_file, tempdir)
    with tarfile.open(archive_file, 'r:gz') as archive:
        archive.extractall(tempdir)

    # Check for supplemental files in archive
    fta_filename = os.path.join(tempdir, _FTA_NAME)
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacement_hocon = pyhocon.ConfigTree(root=True)
        for key, _ in files_to_archive.items():
            replacement_filename = os.path.join(tempdir, f"fta/{key}")
            replacement_hocon.put(key, replacement_filename)

        overrides_hocon = pyhocon.ConfigFactory.parse_string(overrides)
        combined_hocon = replacement_hocon.with_fallback(overrides_hocon)
        overrides = json.dumps(combined_hocon)

    # Load config
    config = Params.from_file(os.path.join(tempdir, _CONFIG_NAME), overrides)
    config.loading_from_archive = True

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=os.path.join(tempdir, _WEIGHTS_NAME),
                       serialization_dir=tempdir,
                       cuda_device=cuda_device)

    # Clean up temp dir
    shutil.rmtree(tempdir)

    return Archive(model=model, config=config)
def predict(self, words):
    global model
    if not model:
        model = Model.load(
            config=self.config,
            serialization_dir=self.flags['difference_model_path'])
        self.default_predictor = Predictor.from_path(
            self.flags['difference_model_path'])
        self.predictor = DifferenceTaggerPredictor(
            self.default_predictor._model,
            dataset_reader=self.default_predictor._dataset_reader)
    annotation = self.predictor.predict_json({"sentence": words})
    subj_annotation = [(t, w) for t, w in annotation if "SUBJ" in t]
    self.info(subj_annotation)
    return annotation
def load_archive_from_folder(archive_file: str,
                             cuda_device: int = -1,
                             overrides: str = "",
                             weights_file: str = None) -> Archive:
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)
    logger.info(f"loading model from directory {archive_file}")
    serialization_dir = resolved_archive_file

    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, _FTA_NAME)
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacements_dict: Dict[str, Any] = {}
        for key, filename in files_to_archive.items():
            if not filename.startswith("/"):
                filename = os.path.join(serialization_dir, f"fta/{key}")
            replacements_dict[key] = filename

        overrides_dict = parse_overrides(overrides)
        combined_dict = with_fallback(preferred=unflatten(replacements_dict),
                                      fallback=overrides_dict)
        overrides = json.dumps(combined_dict)

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=weights_path,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device)

    return Archive(model=model, config=config)
def main(weights_file, device):
    serialization_dir = '/'.join(weights_file.split('/')[:-1])
    config = Params.from_file(join(serialization_dir, 'config.json'))
    reader = DatasetReader.from_params(config['validation_dataset_reader'])
    model = Model.load(config=config,
                       serialization_dir=serialization_dir,
                       weights_file=weights_file,
                       cuda_device=device).eval()
    predictions = get_predictions(model, serialization_dir, reader, device)

    # Write out predictions to file
    predictions_file = weights_file.split('/')[-1].split('.')[0] + '_dev_pred.json'
    predictions_file = join(serialization_dir, predictions_file)
    with open(predictions_file, "w") as writer:
        writer.write(json.dumps(predictions, indent=4) + "\n")
def load_model_from_file(
    serialization_dir: str,
    weights_file: str = _DEFAULT_WEIGHTS,
    include_package: str = "models",
    cuda_device: int = -1,
    overrides: str = None,
):
    logging.disable(logging.INFO)
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
    import_module_and_submodules(include_package)
    model = Model.load(
        config,
        weights_file=os.path.join(serialization_dir, weights_file),
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
    )
    return model
def get_model_runner(model_path, reader, model_config_path=None):
    config_path = model_config_path or os.path.join(model_path, "config.json")
    params = Params.from_file(config_path)
    device = 0 if torch.cuda.is_available() else -1
    model = Model.load(params, model_path, cuda_device=device)
    model.eval()
    predictor = Seq2SeqPredictor(model, reader)
    print(model)
    print("Trainable params count: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    def run_model(batch):
        outputs = predictor.predict_batch_json(batch)
        targets = [b.get('target') for b in batch]
        hyps = []
        for output in outputs:
            decoded_words = output["predicted_tokens"]
            hyp = " ".join(decoded_words).strip()
            hyps.append(hyp)
        return targets, hyps

    return run_model
def load_testing_models_in_eval_mode_from_serialization_dir(
        s_dir,
        training_config_filename,
        name_of_attn_layer_to_replace="_sentence_attention",
        cuda_device=-1):
    loaded_params = Params.from_file(training_config_filename, "")
    model = Model.load(loaded_params, s_dir, cuda_device=cuda_device)
    original_attn_layer = getattr(model, name_of_attn_layer_to_replace)
    talkative_attn_layer = attn_tests_lib.TalkativeSimpleHanAttention(
        original_attn_layer, "temp_weights", "temp_vects/", 1)
    setattr(model, name_of_attn_layer_to_replace, talkative_attn_layer)
    just_the_classifier = attn_tests_lib.ClassifierFromAttnAndInputVects(model._output_logit)
    if cuda_device >= 0:
        model = model.cuda(device=cuda_device)
        just_the_classifier = just_the_classifier.cuda(device=cuda_device)
    model = model.eval()
    just_the_classifier = just_the_classifier.eval()
    return model, just_the_classifier
def train(model_path, train_path, val_path, seed,
          vocabulary_path=None, config_path=None, pretrained_path=None):
    assert os.path.isdir(model_path), "Model directory does not exist"
    set_seed(seed)

    config_path = config_path or os.path.join(model_path, "config.json")
    assert os.path.isfile(config_path), "Config file does not exist"
    params = Params.from_file(config_path)

    vocabulary_path = vocabulary_path or os.path.join(model_path, "vocabulary")
    assert os.path.exists(vocabulary_path), \
        "Vocabulary is not ready, do not forget to run preprocess.py first"
    vocabulary = Vocabulary.from_files(vocabulary_path)

    reader_params = params.duplicate().pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    train_dataset = reader.read(train_path)
    val_dataset = reader.read(val_path) if val_path else None

    if not pretrained_path:
        model_params = params.pop("model")
        model = Model.from_params(model_params, vocab=vocabulary)
    else:
        model = Model.load(params, pretrained_path)

    print(model)
    print("Trainable params count: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    iterator = DataIterator.from_params(params.pop('iterator'))
    iterator.index_with(vocabulary)
    trainer = Trainer.from_params(model, model_path, iterator,
                                  train_dataset, val_dataset, params.pop('trainer'))
    trainer.train()
def _load(cls, params: Params, vocab: Vocabulary, serialization_dir: str,
          weights_file: str, cuda_device: int = -1, **kwargs):
    if params.get('train', None):
        params.pop('train')
    inner_model = Model.load(params,
                             serialization_dir,
                             weights_file=weights_file,
                             cuda_device=cuda_device)
    params.pop('model')
    if params.get('vocabulary', None):
        params.pop('vocabulary')
    model = NeuralNetLanguageModel.from_params(params, model=inner_model,
                                               vocab=vocab, **kwargs)
    return model
def train_model(
    params: Params,
    serialization_dir: Union[str, PathLike],
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    dry_run: bool = False,
    file_friendly_logging: bool = False,
) -> Optional[Model]:
    """
    Trains the model specified in the given [`Params`](../common/params.md#params) object, using the data
    and training parameters also specified in that object, and saves the results in `serialization_dir`.

    # Parameters

    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results and logs.
    recover : `bool`, optional (default=`False`)
        If `True`, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during
        the middle of a run.  For continuing training a model on new data, see `Model.from_archive`.
    force : `bool`, optional (default=`False`)
        If `True`, we will overwrite the serialization directory if it already exists.
    node_rank : `int`, optional
        Rank of the current node in distributed training
    include_package : `List[str]`, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    dry_run : `bool`, optional (default=`False`)
        Do not train a model, but create a vocabulary, show dataset statistics and other
        training information.
    file_friendly_logging : `bool`, optional (default=`False`)
        If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.

    # Returns

    best_model : `Optional[Model]`
        The model with the best epoch weights or `None` if in dry run.
    """
    common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging

    training_util.create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            include_package=include_package,
            dry_run=dry_run,
            file_friendly_logging=file_friendly_logging,
        )

        if not dry_run:
            archive_model(serialization_dir)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data loaders in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        vocab_dir = os.path.join(serialization_dir, "vocabulary")
        if recover:
            vocab = Vocabulary.from_files(vocab_dir)
        else:
            vocab = training_util.make_vocab_from_params(
                params.duplicate(), serialization_dir, print_statistics=dry_run)
        params["vocabulary"] = {
            "type": "from_files",
            "directory": vocab_dir,
            "padding_token": vocab._padding_token,
            "oov_token": vocab._oov_token,
        }

        logging.info(
            "Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}")

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                include_package,
                dry_run,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
                file_friendly_logging,
            ),
            nprocs=num_procs,
        )

        if dry_run:
            return None
        else:
            archive_model(serialization_dir)
            model = Model.load(params, serialization_dir)
            return model
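A minimal sketch of driving `train_model` directly from Python, following the signature above. The config path and package name are placeholders, not taken from any of the snippets here.

# Hedged example; "experiment.jsonnet" and "my_project" are hypothetical.
params = Params.from_file("experiment.jsonnet")
best_model = train_model(
    params,
    serialization_dir="output/run1",
    include_package=["my_project"],  # extra modules to import in trainer workers
    dry_run=False,
)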
def load_archive(
    archive_file: str,
    cuda_device: int = -1,
    opt_level: str = None,
    overrides: str = "",
    weights_file: str = None,
) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    # Parameters

    archive_file : `str`
        The archive file to load the model from.
    cuda_device : `int`, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    opt_level : `str`, optional, (default = `None`)
        Each `opt_level` establishes a set of properties that govern Amp’s implementation of pure or
        mixed precision training. Must be a choice of `"O0"`, `"O1"`, `"O2"`, or `"O3"`.
        See the Apex [documentation](https://nvidia.github.io/apex/amp.html#opt-levels-and-properties)
        for more details. If `None`, defaults to the `opt_level` found in the model params. If
        `cuda_device==-1`, Amp is not used and this argument is ignored.
    overrides : `str`, optional (default = "")
        JSON overrides to apply to the unarchived `Params` object.
    weights_file : `str`, optional (default = None)
        The weights file to use.  If unspecified, weights.th in the archive_file will be used.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(
            f"loading archive file {archive_file} from cache at {resolved_archive_file}"
        )

    if os.path.isdir(resolved_archive_file):
        serialization_dir = resolved_archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info(
            f"extracting archive file {resolved_archive_file} to temp dir {tempdir}"
        )
        with tarfile.open(resolved_archive_file, "r:gz") as archive:
            archive.extractall(tempdir)
        # Postpone cleanup until exit in case the unarchived contents are needed outside
        # this function.
        atexit.register(_cleanup_archive_dir, tempdir)

        serialization_dir = tempdir

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)
        # Fallback for serialization directories.
        if not os.path.exists(weights_path):
            weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(
        config.duplicate(),
        weights_file=weights_path,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        opt_level=opt_level,
    )

    return Archive(model=model, config=config)
def main(serialization_dir, evaluation_data_file, split, cuda_device, weights_file, overrides):
    archive_file = os.path.join(serialization_dir, "model.tar.gz")
    logging_dir = os.path.join(serialization_dir, "logging")

    if os.path.isfile(archive_file):
        weights_file = None
        archive = load_archive(archive_file, cuda_device, overrides, weights_file)
        config = archive.config
        prepare_environment(config)
        prepare_global_logging(logging_dir, file_friendly_logging=False, file_name=split)
        model = archive.model
    else:
        # Load config
        config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
        prepare_environment(config)
        prepare_global_logging(logging_dir, file_friendly_logging=False, file_name=split)

        if weights_file:
            weights_path = os.path.join(serialization_dir, weights_file)
        else:
            weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)
        logger.info("Using weights_file located at : %s", weights_path)

        # Instantiate model. Use a duplicate of the config, as it will get consumed.
        model = Model.load(config.duplicate(),
                           weights_file=weights_path,
                           serialization_dir=serialization_dir,
                           cuda_device=cuda_device)

    # Eval mode ON
    model.eval()

    # Load the evaluation data
    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))

    if evaluation_data_file is None:
        logger.info("--evaluation_data_file not provided. So using --split=%s to read data", split)
        data_path_key = split + '_data_path'
        evaluation_data_path = config.pop(data_path_key)
    else:
        evaluation_data_path = evaluation_data_file

    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)
    logger.info("No. of instances = %d", len(instances))

    iterator = BasicIterator(batch_size=128)
    iterator.index_with(model.vocab)
    metrics, model_predictions = get_model_predictions(model, instances, iterator, cuda_device)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    write_predictions(serialization_dir=serialization_dir,
                      instances=instances,
                      model_predictions=model_predictions,
                      split=split)

    analyze_gold_data(serialization_dir=serialization_dir,
                      instances=instances,
                      split=split)

    analyze_model_predictions(serialization_dir=serialization_dir,
                              instances=instances,
                              model_predictions=model_predictions,
                              split=split)

    analyze_bio_violations(instances=instances,
                           model_predictions=model_predictions)
def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
    node_rank: int = 0,
    include_package: List[str] = None,
) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during
        the middle of a run.  For continuing training a model on new data, see the ``fine-tune``
        command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    node_rank : ``int``, optional
        Rank of the current node in distributed training
    include_package : ``List[str]``, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            file_friendly_logging=file_friendly_logging,
            recover=recover,
            cache_directory=cache_directory,
            cache_prefix=cache_prefix,
            include_package=include_package,
        )
        archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        os.environ["MASTER_ADDR"] = master_addr
        os.environ["MASTER_PORT"] = str(master_port)
        os.environ["WORLD_SIZE"] = str(world_size)

        logging.info(
            f"Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}")

        # Creating `Vocabulary` objects from workers could be problematic since the data iterators
        # in each worker will yield only `rank` specific instances. Hence it is safe to construct
        # the vocabulary and write it to disk before initializing the distributed context. The
        # workers will load the vocabulary from the path specified.
        make_vocab_from_params(params.duplicate(), serialization_dir)
        params["vocabulary"] = {
            "directory_path": os.path.join(serialization_dir, "vocabulary"),
            "extend": False,  # vocab extension would have been done above
        }

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                file_friendly_logging,
                recover,
                cache_directory,
                cache_prefix,
                include_package,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
            ),
            nprocs=num_procs,
        )
        archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        model = Model.load(params, serialization_dir)
        return model
def load_archive(archive_file: str,
                 cuda_device: int = -1,
                 overrides: str = "",
                 weights_file: str = None) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    weights_file: ``str``, optional (default = None)
        The weights file to use.  If unspecified, weights.th in the archive_file will be used.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides: ``str``, optional (default = "")
        JSON overrides to apply to the unarchived ``Params`` object.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(
            f"loading archive file {archive_file} from cache at {resolved_archive_file}"
        )

    tempdir = None
    if os.path.isdir(resolved_archive_file):
        serialization_dir = resolved_archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info(
            f"extracting archive file {resolved_archive_file} to temp dir {tempdir}"
        )
        with tarfile.open(resolved_archive_file, 'r:gz') as archive:
            archive.extractall(tempdir)

        serialization_dir = tempdir

    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, _FTA_NAME)
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacements_dict: Dict[str, Any] = {}
        for key, _ in files_to_archive.items():
            replacement_filename = os.path.join(serialization_dir, f"fta/{key}")
            replacements_dict[key] = replacement_filename

        overrides_dict = parse_overrides(overrides)
        combined_dict = with_fallback(preferred=unflatten(replacements_dict),
                                      fallback=overrides_dict)
        overrides = json.dumps(combined_dict)

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=weights_path,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device)

    if tempdir:
        # Clean up temp dir
        shutil.rmtree(tempdir)

    return Archive(model=model, config=config)
def load_archive(archive_file: str,
                 cuda_device: int = -1,
                 overrides: str = "",
                 weights_file: str = None) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    weights_file: ``str``, optional (default = None)
        The weights file to use.  If unspecified, weights.th in the archive_file will be used.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides: ``str``, optional (default = "")
        JSON overrides to apply to the unarchived ``Params`` object.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(f"loading archive file {archive_file} from cache at {resolved_archive_file}")

    if os.path.isdir(resolved_archive_file):
        serialization_dir = resolved_archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info(f"extracting archive file {resolved_archive_file} to temp dir {tempdir}")
        with tarfile.open(resolved_archive_file, 'r:gz') as archive:
            archive.extractall(tempdir)
        # Postpone cleanup until exit in case the unarchived contents are needed outside
        # this function.
        atexit.register(_cleanup_archive_dir, tempdir)

        serialization_dir = tempdir

    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, _FTA_NAME)
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacements_dict: Dict[str, Any] = {}
        for key, _ in files_to_archive.items():
            replacement_filename = os.path.join(serialization_dir, f"fta/{key}")
            replacements_dict[key] = replacement_filename

        overrides_dict = parse_overrides(overrides)
        combined_dict = with_fallback(preferred=unflatten(replacements_dict),
                                      fallback=overrides_dict)
        overrides = json.dumps(combined_dict)

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=weights_path,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device)

    return Archive(model=model, config=config)
def load_archive(
    archive_file: str,
    cuda_device: int = -1,
    overrides: str = "",
    weights_file: str = None,
) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    # Parameters

    archive_file : `str`
        The archive file to load the model from.
    cuda_device : `int`, optional (default = `-1`)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides : `str`, optional (default = `""`)
        JSON overrides to apply to the unarchived `Params` object.
    weights_file : `str`, optional (default = `None`)
        The weights file to use.  If unspecified, weights.th in the archive_file will be used.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(
            f"loading archive file {archive_file} from cache at {resolved_archive_file}"
        )

    tempdir = None
    try:
        if os.path.isdir(resolved_archive_file):
            serialization_dir = resolved_archive_file
        else:
            # Extract archive to temp dir
            tempdir = tempfile.mkdtemp()
            logger.info(
                f"extracting archive file {resolved_archive_file} to temp dir {tempdir}"
            )
            with tarfile.open(resolved_archive_file, "r:gz") as archive:
                archive.extractall(tempdir)

            serialization_dir = tempdir

        # Load config
        config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)

        if weights_file:
            weights_path = weights_file
        else:
            weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)
            # Fallback for serialization directories.
            if not os.path.exists(weights_path):
                weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

        # Instantiate model. Use a duplicate of the config, as it will get consumed.
        model = Model.load(
            config.duplicate(),
            weights_file=weights_path,
            serialization_dir=serialization_dir,
            cuda_device=cuda_device,
        )
    finally:
        if tempdir is not None:
            logger.info(f"removing temporary unarchived model dir at {tempdir}")
            shutil.rmtree(tempdir, ignore_errors=True)

    return Archive(model=model, config=config)
def evaluate(model_path, test_path, config_path, metric,
             is_multiple_ref, max_count, report_every, batch_size):
    params_path = config_path or os.path.join(model_path, "config.json")
    params = Params.from_file(params_path)
    is_subwords = "tokenizer" in params["reader"] and params["reader"]["tokenizer"]["type"] == "subword"
    reader = DatasetReader.from_params(params.pop("reader"))
    device = 0 if torch.cuda.is_available() else -1
    model = Model.load(params, model_path, cuda_device=device)
    model.training = False
    print(model)
    print("Trainable params count: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    hyps = []
    refs = []
    predictor = Seq2SeqPredictor(model, reader)
    for batch in get_batches(reader, test_path, batch_size):
        outputs = predictor.predict_batch_json(batch)
        targets = [b.get('target') for b in batch]
        for output, target in zip(outputs, targets):
            decoded_words = output["predicted_tokens"]
            if not is_multiple_ref:
                hyp = detokenize(" ".join(decoded_words)) if not is_subwords \
                    else "".join(decoded_words).replace("▁", " ")
                if len(hyp.strip()) <= 1:
                    hyp = "empty"
                    print("Empty hyp")
                if len(target.strip()) <= 1:
                    target = "empty"
                    print("Empty target")
                ref = [target]
            else:
                if isinstance(target, list):
                    reference_sents = target
                elif isinstance(target, str):
                    reference_sents = target.split(" s_s ")
                else:
                    assert False
                decoded_sents = (" ".join(decoded_words)).split("s_s")
                # Strip escaped angle brackets from hypotheses and references.
                hyp = [w.replace("&lt;", "<").replace("&gt;", ">").strip() for w in decoded_sents]
                ref = [w.replace("&lt;", "<").replace("&gt;", ">").strip() for w in reference_sents]
                hyp = " ".join(hyp)
                ref = [" ".join(ref)]

            hyps.append(hyp)
            refs.append(ref)

            if len(hyps) % report_every == 0:
                print("Count: ", len(hyps))
                print("Ref: ", ref)
                print("Hyp: ", hyp)
                if metric in ("bleu", "all"):
                    from nltk.translate.bleu_score import corpus_bleu
                    print("BLEU: ", corpus_bleu(refs, hyps))
                if metric in ("rouge", "all"):
                    rouge = Rouge()
                    scores = rouge.get_scores(hyps, [r[0] for r in refs], avg=True)
                    print("ROUGE: ", scores)

            if max_count and len(hyps) >= max_count:
                break
def load_archive(archive_file: str,
                 cuda_device: int = -1,
                 overrides: str = "",
                 weights_file: str = None) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    weights_file: ``str``, optional (default = None)
        The weights file to use.  If unspecified, weights.th in the archive_file will be used.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides: ``str``, optional (default = "")
        JSON overrides to apply to the unarchived ``Params`` object.
    """
    # redirect to the cache, if necessary
    resolved_archive_file = cached_path(archive_file)

    if resolved_archive_file == archive_file:
        logger.info(f"loading archive file {archive_file}")
    else:
        logger.info(
            f"loading archive file {archive_file} from cache at {resolved_archive_file}"
        )

    if os.path.isdir(resolved_archive_file):
        serialization_dir = resolved_archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info(
            f"extracting archive file {resolved_archive_file} to temp dir {tempdir}"
        )
        with tarfile.open(resolved_archive_file, 'r:gz') as archive:
            archive.extractall(tempdir)
        # Postpone cleanup until exit in case the unarchived contents are needed outside
        # this function.
        atexit.register(_cleanup_archive_dir, tempdir)

        serialization_dir = tempdir

    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, _FTA_NAME)
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r', encoding='utf-8') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacements_dict: Dict[str, Any] = {}
        for key, original_filename in files_to_archive.items():
            replacement_filename = os.path.join(serialization_dir, f"fta/{key}")
            if os.path.exists(replacement_filename):
                replacements_dict[key] = replacement_filename
            else:
                logger.warning(
                    f"Archived file {replacement_filename} not found! At train time "
                    f"this file was located at {original_filename}. This may be "
                    "because you are loading a serialization directory. Attempting to "
                    "load the file from its train-time location.")

        overrides_dict = parse_overrides(overrides)
        combined_dict = with_fallback(preferred=overrides_dict,
                                      fallback=unflatten(replacements_dict))
        overrides = json.dumps(combined_dict)

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)
        # Fallback for serialization directories.
        if not os.path.exists(weights_path):
            weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=weights_path,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device)

    return Archive(model=model, config=config)
def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    batch_weight_key: str = "",
) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during
        the middle of a run.  For continuing training a model on new data, see ``Model.from_archive``.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    node_rank : ``int``, optional
        Rank of the current node in distributed training
    include_package : ``List[str]``, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    training_util.create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            file_friendly_logging=file_friendly_logging,
            include_package=include_package,
            batch_weight_key=batch_weight_key,
        )
        archive_model(serialization_dir)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        logging.info(
            f"Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}")

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data iterators in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        if params.get("vocabulary", Params({})).get("type", "") != "from_files":
            vocab = training_util.make_vocab_from_params(params.duplicate(), serialization_dir)
            params["vocabulary"] = {
                "type": "from_files",
                "directory": os.path.join(serialization_dir, "vocabulary"),
                "padding_token": vocab._padding_token,
                "oov_token": vocab._oov_token,
            }

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                file_friendly_logging,
                include_package,
                batch_weight_key,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
            ),
            nprocs=num_procs,
        )
        archive_model(serialization_dir)
        model = Model.load(params, serialization_dir)
        return model
def load_archive(archive_file: str,
                 cuda_device: int = -1,
                 overrides: str = "",
                 weights_file: str = None) -> Archive:
    """
    Instantiates an Archive from an archived `tar.gz` file.

    Parameters
    ----------
    archive_file: ``str``
        The archive file to load the model from.
    weights_file: ``str``, optional (default = None)
        The weights file to use.  If unspecified, weights.th in the archive_file will be used.
    cuda_device: ``int``, optional (default = -1)
        If `cuda_device` is >= 0, the model will be loaded onto the
        corresponding GPU. Otherwise it will be loaded onto the CPU.
    overrides: ``str``, optional (default = "")
        HOCON overrides to apply to the unarchived ``Params`` object.
    """
    # redirect to the cache, if necessary
    archive_file = cached_path(archive_file)

    tempdir = None
    if os.path.isdir(archive_file):
        serialization_dir = archive_file
    else:
        # Extract archive to temp dir
        tempdir = tempfile.mkdtemp()
        logger.info("extracting archive file %s to temp dir %s", archive_file, tempdir)
        with tarfile.open(archive_file, 'r:gz') as archive:
            archive.extractall(tempdir)

        serialization_dir = tempdir

    # Check for supplemental files in archive
    fta_filename = os.path.join(serialization_dir, _FTA_NAME)
    if os.path.exists(fta_filename):
        with open(fta_filename, 'r') as fta_file:
            files_to_archive = json.loads(fta_file.read())

        # Add these replacements to overrides
        replacement_hocon = pyhocon.ConfigTree(root=True)
        for key, _ in files_to_archive.items():
            replacement_filename = os.path.join(serialization_dir, f"fta/{key}")
            replacement_hocon.put(key, replacement_filename)

        overrides_hocon = pyhocon.ConfigFactory.parse_string(overrides)
        combined_hocon = replacement_hocon.with_fallback(overrides_hocon)
        overrides = json.dumps(combined_hocon)

    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
    config.loading_from_archive = True

    if weights_file:
        weights_path = weights_file
    else:
        weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

    # Instantiate model. Use a duplicate of the config, as it will get consumed.
    model = Model.load(config.duplicate(),
                       weights_file=weights_path,
                       serialization_dir=serialization_dir,
                       cuda_device=cuda_device)

    if tempdir:
        # Clean up temp dir
        shutil.rmtree(tempdir)

    return Archive(model=model, config=config)