def make_vocab_from_params(params: Params, serialization_dir: str):
    """
    Build a vocabulary from the datasets described in ``params`` and write it to
    ``<serialization_dir>/vocabulary``.

    Parameters
    ----------
    params : ``Params``
        Experiment configuration. The ``vocabulary`` and
        ``datasets_for_vocab_creation`` keys are popped here; the object is also
        passed to ``prepare_environment`` and ``datasets_from_params``.
    serialization_dir : ``str``
        Directory (created if missing) under which the ``vocabulary`` directory
        is written.

    Raises
    ------
    ConfigurationError
        If the ``vocabulary`` directory already exists and contains entries, or if
        ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # ``os.listdir`` always returns a list (never ``None``), so the emptiness test
    # must rely on truthiness; comparing against ``None`` rejected empty directories.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)

    # Defaults to every loaded dataset when the key is absent.
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")
def make_vocab_from_params(params: Params):
    """
    Create a vocabulary from the configured datasets and save it to the directory
    given by ``vocabulary.directory_path`` in ``params``.

    Parameters
    ----------
    params : ``Params``
        Experiment configuration. ``vocabulary`` and ``datasets_for_vocab_creation``
        are popped from it; the object is also passed to ``prepare_environment``
        and ``datasets_from_params``.

    Raises
    ------
    ConfigurationError
        If ``vocabulary.directory_path`` is missing, or if
        ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    prepare_environment(params)

    vocabulary_config = params.pop("vocabulary", {})
    target_dir = vocabulary_config.get('directory_path')
    if target_dir is None:
        raise ConfigurationError("To use `make-vocab` your configuration must contain a value "
                                 "at vocabulary.directory_path")
    os.makedirs(target_dir, exist_ok=True)

    loaded_datasets = datasets_from_params(params)
    selected_names = set(params.pop("datasets_for_vocab_creation", loaded_datasets))

    for dataset in selected_names:
        if dataset not in loaded_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(selected_names))

    def _selected_instances():
        # Lazily walk every dataset, keeping only instances of the selected ones.
        for key, instances in loaded_datasets.items():
            for instance in instances:
                if key in selected_names:
                    yield instance

    # The vocabulary itself is built from empty params, not from ``vocabulary_config``.
    vocab = Vocabulary.from_params(Params({}), _selected_instances())
    vocab.save_to_files(target_dir)
    logger.info("done creating vocab")
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    """
    Build the vocabulary and the model without training: print dataset and
    vocabulary statistics, save the vocabulary under
    ``<serialization_dir>/vocabulary`` and log which model parameters are frozen
    and which are tunable.

    Parameters
    ----------
    params : ``Params``
        Experiment configuration. The ``vocabulary``,
        ``datasets_for_vocab_creation``, ``model`` and ``trainer`` keys are popped here.
    serialization_dir : ``str``
        Directory (created if missing) under which the ``vocabulary`` directory
        is written.

    Raises
    ------
    ConfigurationError
        If the ``vocabulary`` directory already exists and contains entries, or if
        ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # ``os.listdir`` always returns a list (never ``None``), so the emptiness test
    # must rely on truthiness; comparing against ``None`` rejected empty directories.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)

    # Defaults to every loaded dataset when the key is absent.
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")

    # Parameters whose name matches any ``no_grad`` regex are frozen.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    """
    Load an archived model, evaluate it on ``args.input_file`` and return the metrics.

    Attributes read from ``args``: ``archive_file``, ``cuda_device``, ``overrides``,
    ``weights_file``, ``input_file``, ``batch_weight_key`` and ``output_file``.
    When ``output_file`` is truthy the metrics are also written there as JSON.
    """
    # Disable some of the more verbose logging statements
    for noisy_logger in ('allennlp.common.params', 'allennlp.nn.initializers'):
        logging.getLogger(noisy_logger).disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data.
    # Prefer the validation dataset reader when one is configured; otherwise fall
    # back to the default dataset_reader used for both training and validation.
    reader_params = config.pop('validation_dataset_reader', None)
    if reader_params is None:
        reader_params = config.pop('dataset_reader')
    dataset_reader = DatasetReader.from_params(reader_params)

    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    # Same fallback scheme for the iterator.
    iterator_config = config.pop("validation_iterator", None)
    if iterator_config is None:
        iterator_config = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_config)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device, args.batch_weight_key)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for metric_name, metric_value in metrics.items():
        logger.info("%s: %s", metric_name, metric_value)

    if args.output_file:
        with open(args.output_file, "w") as file:
            json.dump(metrics, file, indent=4)
    return metrics
def main(args: argparse.Namespace):
    """
    Train a model from a parameter file, optionally report the run to a Comet
    ``Experiment``, optionally annotate and score the test file, dump the metrics
    and archive the model.

    Attributes read from ``args``: ``include_package``, ``param_path``,
    ``overrides``, ``fix``, ``serialization_dir``, ``recover``, ``force``,
    ``file_friendly_logging``, ``comet``, ``workspace``, ``project``, ``tags`` and
    ``no_archive``.
    """
    for package_name in args.include_package:
        import_module_and_submodules(package_name)
    params = Params.from_file(args.param_path, args.overrides)

    # Fixed seeds are used when ``args.fix`` is set; otherwise fresh ones are drawn.
    random_seed, numpy_seed, pytorch_seed = 41, 11, 302
    if not args.fix:
        random_seed, numpy_seed, pytorch_seed = random.randint(
            0, 999999999), random.randint(0, 999999999), random.randint(
                0, 999999999)
    params["random_seed"] = random_seed
    params["numpy_seed"] = numpy_seed
    params["pytorch_seed"] = pytorch_seed
    prepare_environment(params)

    serialization_dir = args.serialization_dir
    create_serialization_dir(params, serialization_dir, args.recover,
                             args.force)
    prepare_global_logging(serialization_dir, args.file_friendly_logging)

    # Captured before ``params`` is handed to ``TrainPipelineModel.from_params`` below.
    hyperparams = list(
        get_hyperparams(params.as_dict(infer_type_and_cast=True)))
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    # Read (not popped) here; both paths stay in ``params``.
    test_file = params.params.get("test_data_path", None)
    validation_data_path = params.get("validation_data_path", None)
    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    # ``test_command`` is only popped from the config when test evaluation is requested.
    test_command = None
    if evaluate_on_test:
        test_command = BaseEvaluationCommand.from_params(
            params.pop("test_command"))
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    train_model = TrainPipelineModel.from_params(
        params=params, serialization_dir=serialization_dir, local_rank=0)
    trainer = train_model.trainer
    if trainer.validation_command is not None:
        trainer.validation_command.maybe_set_gold_file(validation_data_path)
    params.assert_empty('base train command')

    if args.comet is not None:
        experiment = Experiment(api_key=args.comet,
                                workspace=args.workspace,
                                project_name=args.project,
                                parse_args=False,
                                auto_output_logging=None)
        if args.tags:
            experiment.add_tags(args.tags)
        # The "code" attached to the experiment is the raw parameter file, followed
        # by its jsonnet evaluation and the command-line overrides.
        with open(args.param_path) as fil:
            code = "".join(fil.readlines())
        code += "\n\n#=============Full details=============\n\n"
        full_details = _jsonnet.evaluate_file(args.param_path)
        code += full_details
        code += "\n\n#=============IMPORTANT: overwritten options============\n\n"
        code += args.overrides
        experiment.set_code(code, overwrite=True)

        for key, val in hyperparams:
            experiment.log_parameter(key, val)

        experiment.log_parameter("model_directory", serialization_dir)
        experiment.log_parameter("cuda_device", cuda_device)
        experiment.log_parameter("hostname", socket.gethostname())
        experiment.log_parameter("random_seed", random_seed)
        experiment.log_parameter("numpy_seed", numpy_seed)
        experiment.log_parameter("pytorch_seed", pytorch_seed)
    else:
        experiment = None

    try:
        metrics = trainer.train(experiment)
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir)
        raise

    # Evaluate
    if test_file and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights (see pred_test.txt)."
        )
        trainer.annotator.annotate_file(
            trainer.model, test_file,
            os.path.join(serialization_dir, "pred_test.txt"))

        if test_command:
            logger.info("Comparing against gold standard.")
            test_command.maybe_set_gold_file(test_file)
            test_metrics = test_command.evaluate(
                os.path.join(serialization_dir, "pred_test.txt"))
            if experiment:
                # Only scalar metrics are reported to the experiment.
                with experiment.test():
                    experiment.log_metrics({
                        k: v
                        for k, v in test_metrics.items() if np.isscalar(v)
                    })
            metrics = merge_dicts(metrics, "test", test_metrics)

    dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                 metrics,
                 log=True)

    if not args.no_archive:
        # Now tar up results
        archive_model(serialization_dir)
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Run the experiment described by ``params``: build the vocabulary, model,
    iterators and trainer, train, archive the results in ``serialization_dir`` and
    optionally evaluate on the test set.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        Passed to ``prepare_global_logging``.
    recover : ``bool``, optional (default=False)
        Passed to ``create_serialization_dir``.
    force : ``bool``, optional (default=False)
        Passed to ``create_serialization_dir`` together with ``recover``.

    Returns
    -------
    best_model: ``Model``
        The model, after loading the state stored in ``best.th``.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # ``cuda_device`` may be a single id or a list of ids; check each of them.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    devices_to_check = cuda_device if isinstance(cuda_device, list) else [cuda_device]
    for device in devices_to_check:
        check_for_gpu(device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    vocab_dataset_names = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in vocab_dataset_names:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(vocab_dataset_names))

    def _vocab_instances():
        # Lazily yield the instances of the datasets chosen for vocabulary creation.
        for key, instances in all_datasets.items():
            for instance in instances:
                if key in vocab_dataset_names:
                    yield instance

    vocab = Vocabulary.from_params(params.pop("vocabulary", {}), _vocab_instances())

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    validation_iterator = None
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")

    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for param_name, param in model.named_parameters():
        if any(re.search(pattern, param_name) for pattern in no_grad_regexes):
            param.requires_grad_(False)

    frozen_names, tunable_names = get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for param_name in frozen_names:
        logger.info(param_name)
    logger.info("Following parameters are Tunable (with gradient):")
    for param_name in tunable_names:
        logger.info(param_name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer_class = Trainer.by_name(trainer_choice)
    trainer = trainer_class.from_params(model=model,
                                        serialization_dir=serialization_dir,
                                        iterator=iterator,
                                        train_data=train_data,
                                        validation_data=validation_data,
                                        params=trainer_params,
                                        validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)
        if os.path.exists(weights_path):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_state = torch.load(os.path.join(serialization_dir, 'best.th'))
    model.load_state_dict(best_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
                model, test_data, validation_iterator or iterator,
                cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for metric_name, metric_value in test_metrics.items():
            metrics["test_" + metric_name] = metric_value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return model
def train_model(params: Params, serialization_dir: str) -> Model:
    """
    Entry point for training a model directly from a parameter specification.

    Random seeds are configured from ``params`` via ``prepare_environment`` at the
    start of this function, so code that consumes Pytorch or numpy randomness
    before this call can affect reproducibility.

    The function reads the train (and optional validation / test) data, builds and
    saves the vocabulary, trains the model, archives the results and, when
    ``evaluate_on_test`` is set and test data is configured, evaluates on the test
    data.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)

    # Mirror stdout / stderr into log files inside the serialization directory.
    stdout_log = os.path.join(serialization_dir, "stdout.log")
    stderr_log = os.path.join(serialization_dir, "stderr.log")
    sys.stdout = TeeLogger(stdout_log, sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(stderr_log, sys.stderr)  # type: ignore

    file_handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(file_handler)

    # Dump a copy of the configuration; the copy keeps ``params`` itself untouched.
    params_snapshot = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(params_snapshot, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets: Dict[str, Dataset] = {"train": train_data}

    validation_data = None
    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets["validation"] = validation_data

    test_data = None
    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = dataset_reader.read(test_data_path)
        all_datasets["test"] = test_data

    vocab_dataset_names = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in vocab_dataset_names:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(vocab_dataset_names))

    vocab_params = params.pop("vocabulary", {})
    vocab_instances = []
    for key, dataset in all_datasets.items():
        for instance in dataset.instances:
            if key in vocab_dataset_names:
                vocab_instances.append(instance)
    vocab = Vocabulary.from_params(vocab_params, Dataset(vocab_instances))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_data.index_instances(vocab)
        evaluate(model, test_data, iterator, cuda_device=trainer._cuda_device)  # pylint: disable=protected-access
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    return model
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    include_package: List[str] = None,
    batch_weight_key: str = "",
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[str] = None,
    # For fine-tuning:
    model: Model = None,
    extend_vocab: bool = False,
    embedding_sources_mapping: Dict[str, str] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the ``Model`` object and in distributed
    training, nothing is returned.

    # Parameters

    process_rank : ``int``
        The process index that is initialized using the GPU device id. It is also used as the
        index into ``distributed_device_ids`` and as ``local_rank`` for ``TrainModel``.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
        NOTE(review): this argument is not referenced in the body of this function.
    include_package : ``List[str]``, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    batch_weight_key : ``str``, optional (default="")
        Passed through to ``TrainModel.from_params``.
    node_rank : ``int``, optional
        Rank of the node
    master_addr : ``str``, optional (default="127.0.0.1")
        Host part of the ``tcp://`` init method given to ``dist.init_process_group``.
    master_port : ``int``, optional (default=29500)
        Port part of the ``tcp://`` init method given to ``dist.init_process_group``.
    world_size : ``int``, optional
        The number of processes involved in distributed training. Values greater than 1
        switch this function into distributed mode.
    distributed_device_ids : ``List[str]``, optional
        Device ids for this node, indexed by ``process_rank``. Its length is taken as the number
        of processes per node, so it must be provided when ``world_size > 1``.
    model, extend_vocab, embedding_sources_mapping : optional
        Fine-tuning arguments. NOTE(review): none of these is referenced in the body of this
        function.

    # Returns

    best_model : ``Optional[Model]``
        ``train_loop.model`` when not running distributed; ``None`` in distributed training.
    """
    common_util.prepare_global_logging(
        serialization_dir, file_friendly_logging, rank=process_rank, world_size=world_size
    )
    common_util.prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    if distributed:
        # Since the worker is spawned and not forked, the extra imports
        # need to be done again.
        if include_package is not None:
            for package_name in include_package:
                common_util.import_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # Number of processes per node is useful to know if a process
        # is a master in the local node(node in which it is running)
        os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node)

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        torch.cuda.set_device(int(gpu_id))
        dist.init_process_group(
            backend="nccl",
            init_method=f"tcp://{master_addr}:{master_port}",
            world_size=world_size,
            rank=global_rank,
        )
        logging.info(
            f"Process group of world size {world_size} initialized "
            f"for distributed training in worker {global_rank}"
        )

    train_loop = TrainModel.from_params(
        params=params,
        serialization_dir=serialization_dir,
        local_rank=process_rank,
        batch_weight_key=batch_weight_key,
    )

    try:
        if distributed:  # let the setup get ready for all the workers
            dist.barrier()

        metrics = train_loop.run()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        # Only the rank-0 process of the node attempts the archive.
        if master and os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights."
            )
            archive_model(serialization_dir)
        raise

    if master:
        train_loop.finish(metrics)

    if not distributed:
        return train_loop.model

    return None  # to make mypy happy
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    """
    Load an archived model and evaluate it on one or more datasets.

    The data to evaluate is ``args.evaluation_data_file`` when given; otherwise the
    ``validation_data_path`` and/or ``test_data_path`` entries of the archived
    config are used. ``args.output_file`` is a ``;``-separated list: when it has one
    entry per dataset each entry is used as that dataset's output file, otherwise
    the first entry is used as a prefix for ``<prefix>_<short name>.txt``.

    Returns the metrics of the single evaluated dataset, or, when the number of
    datasets is not exactly one, a dict mapping each dataset's short name
    (``input`` / ``dev`` / ``test``) to its metrics.
    """
    # Disable some of the more verbose logging statements
    for noisy_logger in ('allennlp.common.params', 'allennlp.nn.initializers'):
        logging.getLogger(noisy_logger).disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data
    reader_config = config.pop('dataset_reader')
    # The evaluator type defaults to the dataset reader's type.
    if "evaluator_type" in config:
        eval_type = config.get("evaluator_type")
    else:
        eval_type = reader_config.get("type")
    dataset_reader = DatasetReader.from_params(reader_config)

    output_files = args.output_file.split(";")

    # (short name, path) pairs of the datasets to evaluate, in evaluation order.
    if args.evaluation_data_file:
        targets = [("input", args.evaluation_data_file)]
    else:
        targets = []
        if "validation_data_path" in config:
            targets.append(("dev", config["validation_data_path"]))
        if "test_data_path" in config:
            targets.append(("test", config["test_data_path"]))

    metrics_out = {}

    iterator = DataIterator.from_params(config.pop("iterator"))
    iterator.index_with(model.vocab)

    single_target = len(targets) == 1
    one_output_per_target = len(output_files) == len(targets)

    for index, (short_name, data_path) in enumerate(targets):
        if one_output_per_target:
            out_file = output_files[index]
        else:
            out_file = "{0}_{1}.txt".format(output_files[0], short_name)

        logger.info("Reading evaluation data from %s", data_path)
        dataset = dataset_reader.read(data_path)
        metrics = evaluate(model, dataset, iterator, out_file, eval_type)
        if out_file is not None:
            logging.info("Predictions exported to {0}".format(out_file))
        logger.info("Finished evaluating.")
        logger.info("Metrics:")
        for metric_name, metric_value in metrics.items():
            logger.info("%s: %s", metric_name, metric_value)

        if single_target:
            metrics_out = metrics
        else:
            metrics_out[short_name] = metrics

    return metrics_out
def main(serialization_dir, evaluation_data_file, split, cuda_device, weights_file, overrides):
    """
    Load a trained model from ``serialization_dir``, run it over an evaluation
    dataset, log the metrics and write prediction / analysis outputs.

    Parameters
    ----------
    serialization_dir : str
        Directory holding either ``model.tar.gz`` or the config plus weights files.
        Logs go to its ``logging`` sub-directory.
    evaluation_data_file : str or None
        Path of the data to evaluate. When ``None`` the path is popped from the
        config under ``<split>_data_path``.
    split : str
        Name of the split; used as the log file name, as the fallback config key
        prefix and passed to the writer / analysis helpers.
    cuda_device : int
        Device passed to model loading and to ``get_model_predictions``.
    weights_file : str or None
        Weights file name inside ``serialization_dir``. It is only used when no
        archive exists; when ``model.tar.gz`` is present it is discarded.
    overrides : str
        Config overrides passed to ``load_archive`` / ``Params.from_file``.
    """
    archive_file = os.path.join(serialization_dir, "model.tar.gz")
    logging_dir = os.path.join(serialization_dir, "logging")

    if os.path.isfile(archive_file):
        # An archive takes precedence: any explicitly supplied weights file is dropped.
        weights_file = None
        archive = load_archive(archive_file, cuda_device, overrides, weights_file)
        config = archive.config
        prepare_environment(config)
        prepare_global_logging(logging_dir, file_friendly_logging=False, file_name=split)
        model = archive.model
    else:
        # Load config
        config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), overrides)
        prepare_environment(config)
        prepare_global_logging(logging_dir, file_friendly_logging=False, file_name=split)

        if weights_file:
            weights_path = os.path.join(serialization_dir, weights_file)
        else:
            weights_path = os.path.join(serialization_dir, _WEIGHTS_NAME)

        logger.info("Using weights_file located at : %s", weights_path)
        # Instantiate model. Use a duplicate of the config, as it will get consumed.
        model = Model.load(config.duplicate(),
                           weights_file=weights_path,
                           serialization_dir=serialization_dir,
                           cuda_device=cuda_device)

    # Eval mode ON
    model.eval()

    # Load the evaluation data
    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))

    if evaluation_data_file is None:
        logger.info("--evaluation_data_file not provided. So using --split=%s to read data", split)
        data_path_key = split + '_data_path'
        evaluation_data_path = config.pop(data_path_key)
    else:
        evaluation_data_path = evaluation_data_file

    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)
    logger.info("No. of instances = %d", len(instances))

    iterator = BasicIterator(batch_size=128)
    iterator.index_with(model.vocab)

    # Use this function's own ``cuda_device`` argument; ``args`` is not defined in
    # this scope, so the previous ``args.cuda_device`` relied on an unrelated global.
    metrics, model_predictions = get_model_predictions(model, instances, iterator, cuda_device)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    write_predictions(serialization_dir=serialization_dir,
                      instances=instances,
                      model_predictions=model_predictions,
                      split=split)
    analyze_gold_data(serialization_dir=serialization_dir, instances=instances, split=split)
    analyze_model_predictions(serialization_dir=serialization_dir,
                              instances=instances,
                              model_predictions=model_predictions,
                              split=split)
    analyze_bio_violations(instances=instances, model_predictions=model_predictions)
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    include_package: List[str] = None,
    dry_run: bool = False,
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[int] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the `Model` object and in distributed
    training, nothing is returned.

    # Parameters

    process_rank : `int`
        The process index that is initialized using the GPU device id. It is also used as the
        index into `distributed_device_ids` and as `local_rank` for `TrainModel`.
    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results and logs.
    include_package : `List[str]`, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    dry_run : `bool`, optional (default=`False`)
        Do not train a model, but create a vocabulary, show dataset statistics and other training
        information. In this function the flag causes an early `return None` right after the
        `TrainModel` object has been constructed.
    node_rank : `int`, optional
        Rank of the node.
    master_addr : `str`, optional (default=`"127.0.0.1"`)
        Address of the master node for distributed training.
    master_port : `int`, optional (default=`29500`)
        Port of the master node for distributed training.
    world_size : `int`, optional
        The number of processes involved in distributed training. Values greater than 1
        switch this function into distributed mode.
    distributed_device_ids: `List[int]`, optional
        IDs of the devices used involved in distributed training, indexed by `process_rank`.
        Its length is taken as the number of processes per node, so it must be provided when
        `world_size > 1`. A negative id selects the `gloo` backend instead of `nccl`.

    # Returns

    best_model : `Optional[Model]`
        `train_loop.model` when not distributed, or `None` if in distributed training or in
        dry run.
    """
    prepare_global_logging(
        serialization_dir, rank=process_rank, world_size=world_size,
    )
    common_util.prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    include_package = include_package or []

    if distributed:
        # Since the worker is spawned and not forked, the extra imports need to be done again.
        # Both the ones from the plugins and the ones from `include_package`.
        import_plugins()
        for package_name in include_package:
            common_util.import_module_and_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # Number of processes per node is useful to know if a process
        # is a master in the local node(node in which it is running)
        os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node)

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        # Non-negative ids are treated as CUDA devices (nccl); negative ids fall back to gloo.
        if gpu_id >= 0:
            torch.cuda.set_device(int(gpu_id))
            dist.init_process_group(
                backend="nccl",
                init_method=f"tcp://{master_addr}:{master_port}",
                world_size=world_size,
                rank=global_rank,
            )
        else:
            dist.init_process_group(
                backend="gloo",
                init_method=f"tcp://{master_addr}:{master_port}",
                world_size=world_size,
                rank=global_rank,
            )
        logging.info(
            f"Process group of world size {world_size} initialized "
            f"for distributed training in worker {global_rank}"
        )

    train_loop = TrainModel.from_params(
        params=params, serialization_dir=serialization_dir, local_rank=process_rank,
    )

    # In a dry run everything is constructed but the training loop is never run.
    if dry_run:
        return None

    try:
        if distributed:  # let the setup get ready for all the workers
            dist.barrier()

        metrics = train_loop.run()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        # Only the rank-0 process of the node attempts the archive.
        if master and os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights."
            )
            archive_model(serialization_dir)
        raise

    if master:
        train_loop.finish(metrics)

    if not distributed:
        return train_loop.model

    return None
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Train the experiment described by ``params`` and write every artifact
    (config, vocabulary, archive, metrics) into ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        Forwarded to ``prepare_global_logging``.
    recover : ``bool``, optional (default=False)
        Forwarded to ``create_serialization_dir``.

    Returns
    -------
    best_model: ``Model``
        The trained model after ``best.th`` has been loaded into it.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    requested_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(requested_device)

    # Persist a copy of the full configuration before anything is popped.
    config_snapshot = deepcopy(params).as_dict(quiet=True)
    config_path = os.path.join(serialization_dir, CONFIG_NAME)
    with open(config_path, "w") as config_file:
        json.dump(config_snapshot, config_file, indent=4)

    all_datasets = datasets_from_params(params)
    vocab_dataset_keys = set(
        params.pop("datasets_for_vocab_creation", all_datasets))
    for dataset_key in vocab_dataset_keys:
        if dataset_key not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset_key}")

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(vocab_dataset_keys))
    vocab_params = params.pop("vocabulary", {"type": "vocabulary"})
    vocab_instances = (instance
                       for key, dataset in all_datasets.items()
                       for instance in dataset
                       if key in vocab_dataset_keys)
    vocab = RegistrableVocabulary.from_params(vocab_params, vocab_instances)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())

    # Freeze every parameter whose name matches one of the "no_grad" regexes.
    frozen_names = []
    tunable_names = []
    for param_name, param in model.named_parameters():
        is_frozen = any(re.search(pattern, param_name)
                        for pattern in no_grad_regexes)
        if is_frozen:
            param.requires_grad_(False)
        (frozen_names if is_frozen else tunable_names).append(param_name)

    logger.info("Following parameters are Frozen (without gradient):")
    for param_name in frozen_names:
        logger.info(param_name)
    logger.info("Following parameters are Tunable (with gradient):")
    for param_name in tunable_names:
        logger.info(param_name)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # If at least one epoch finished, weights exist and can be archived.
        weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)
        if os.path.exists(weights_path):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_state = torch.load(os.path.join(serialization_dir, 'best.th'))
    best_model = model
    best_model.load_state_dict(best_state)

    if test_data:
        if evaluate_on_test:
            logger.info(
                "The model will be evaluated using the best epoch weights.")
            test_metrics = evaluate(
                best_model, test_data, iterator,
                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
            for metric_name, metric_value in test_metrics.items():
                metrics["test_" + metric_name] = metric_value
        else:
            logger.info(
                "To evaluate on the test set after training, pass the "
                "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    metrics_path = os.path.join(serialization_dir, "metrics.json")
    with open(metrics_path, "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return best_model
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    include_package: List[str] = None,
    batch_weight_key: str = "",
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[str] = None,
    # For fine-tuning:
    model: Model = None,
    extend_vocab: bool = False,
    embedding_sources_mapping: Dict[str, str] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned as a
    worker process. In a single GPU experiment, this returns the ``Model`` object and in distributed
    training, nothing is returned.

    # Parameters

    process_rank : ``int``
        The process index within the node; also used to pick this worker's entry
        from ``distributed_device_ids``.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        Forwarded to ``prepare_global_logging``.
    recover : ``bool``, optional (default=False)
        Forwarded to ``TrainerPieces.from_params`` / ``TrainerBase.from_params``.
    include_package : ``List[str]``, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    batch_weight_key : ``str``, optional (default="")
        Forwarded to ``evaluate`` when evaluating on the test set.
    node_rank : ``int``, optional
        Rank of the node; combined with ``process_rank`` to compute the global rank.
    master_addr : ``str``, optional (default="127.0.0.1")
        Host used in the ``tcp://`` init method of the process group.
    master_port : ``int``, optional (default=29500)
        Port used in the ``tcp://`` init method of the process group.
    world_size : ``int``, optional
        The number of processes involved in distributed training. Values greater
        than 1 enable distributed mode.
    distributed_device_ids : ``List[str]``, optional
        Device ids of the workers on this node, indexed by ``process_rank``.
        Required when ``world_size > 1``.
    model : ``Model``, optional
        Forwarded to ``TrainerPieces.from_params`` (fine-tuning).
    extend_vocab : ``bool``, optional (default=False)
        Forwarded to ``TrainerPieces.from_params`` (fine-tuning).
    embedding_sources_mapping : ``Dict[str, str]``, optional
        Forwarded to ``TrainerPieces.from_params`` (fine-tuning).

    # Returns

    best_model : ``Optional[Model]``
        ``trainer.model`` when not distributed, otherwise ``None``.

    # Raises

    ValueError
        If ``evaluate_on_test`` is set together with a non-default trainer type.
    """
    prepare_global_logging(serialization_dir,
                           file_friendly_logging,
                           rank=process_rank,
                           world_size=world_size)
    prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    if distributed:
        # Since the worker is spawned and not forked, the extra imports
        # need to be done again.
        if include_package is not None:
            for package_name in include_package:
                import_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The Unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group using `init_process_group`
        global_rank = node_rank * num_procs_per_node + process_rank

        # Number of processes per node is useful to know if a process
        # is a master in the local node(node in which it is running)
        os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node)

        # In distributed training, the configured device is always going to be a list.
        # The corresponding gpu id for the particular worker is obtained by picking the id
        # from the device list with the rank as index
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Till now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer needs to only know about its specific GPU id.
        params["trainer"]["cuda_device"] = gpu_id
        params["trainer"]["world_size"] = world_size
        params["trainer"]["distributed"] = True

        # NCCL only works with CUDA devices; a negative device id denotes a
        # CPU worker, which has to use the gloo backend instead.
        device_index = int(gpu_id)
        if device_index >= 0:
            torch.cuda.set_device(device_index)
            backend = "nccl"
        else:
            backend = "gloo"
        dist.init_process_group(
            backend=backend,
            init_method=f"tcp://{master_addr}:{master_port}",
            world_size=world_size,
            rank=global_rank,
        )
        logging.info(f"Process group of world size {world_size} initialized "
                     f"for distributed training in worker {global_rank}")

    trainer_type = params.get("trainer", {}).get("type", "default")

    if trainer_type == "default":
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(
            params=params,
            serialization_dir=serialization_dir,
            recover=recover,
            model=model,
            embedding_sources_mapping=embedding_sources_mapping,
            extend_vocab=extend_vocab,
        )
        trainer = Trainer.from_params(
            model=pieces.model,
            serialization_dir=serialization_dir,
            iterator=pieces.iterator,
            train_data=pieces.train_dataset,
            validation_data=pieces.validation_dataset,
            params=pieces.params,
            validation_iterator=pieces.validation_iterator,
            local_rank=process_rank,
        )

        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset
    else:
        if evaluate_on_test:
            raise ValueError(
                "--evaluate-on-test only works with the default Trainer. "
                "If you're using the CallbackTrainer you can use a callback "
                "to evaluate at Events.TRAINING_END; otherwise you'll have "
                "to run allennlp evaluate separately.")

        trainer = TrainerBase.from_params(params, serialization_dir, recover)
        evaluation_dataset = None
        evaluation_iterator = None

    params.assert_empty("base train command")

    try:
        if distributed:  # let the setup get ready for all the workers
            dist.barrier()

        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if master and os.path.exists(
                os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    # Only the master worker evaluates and writes the metrics file.
    if master:
        if evaluation_dataset and evaluate_on_test:
            logger.info(
                "The model will be evaluated using the best epoch weights.")
            test_metrics = evaluate(
                trainer.model,
                evaluation_dataset,
                evaluation_iterator,
                cuda_device=trainer.cuda_device,
                batch_weight_key=batch_weight_key,
            )

            for key, value in test_metrics.items():
                metrics["test_" + key] = value
        elif evaluation_dataset:
            logger.info(
                "To evaluate on the test set after training, pass the "
                "'evaluate_on_test' flag, or use the 'allennlp evaluate' command."
            )
        dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                     metrics,
                     log=True)

    if not distributed:
        return trainer.model

    return None  # to make mypy happy
help='a JSON structure used to override the experiment configuration') parser.add_argument( '--file-friendly-logging', action='store_true', default=False, help='outputs tqdm status on separate lines and slows tqdm refresh rate') args = parser.parse_args() params = Params.from_file(args.param_path, args.overrides) random_seed, numpy_seed, pytorch_seed = random.randint( 0, 999999999), random.randint(0, 999999999), random.randint(0, 999999999) params["random_seed"] = random_seed params["numpy_seed"] = numpy_seed params["pytorch_seed"] = pytorch_seed prepare_environment(params) from graph_dependency_parser.components.evaluation.predictors import Evaluator, EmptyMRPEvaluator from graph_dependency_parser.train.amtrainer import AMTrainer, TrainerPieces serialization_dir = args.serialization_dir create_serialization_dir(params, serialization_dir, args.recover, args.force) stdout_handler = prepare_global_logging(serialization_dir, args.file_friendly_logging) cuda_device = params.params.get('trainer').get('cuda_device', -1) check_for_gpu(cuda_device) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) evaluate_on_test = params.pop_bool("evaluate_on_test", False) test_evaluators = params.pop("test_evaluators", [])
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    """
    Load a model archive, evaluate it on ``args.evaluation_data_file`` and
    return the resulting metrics.

    When ``args.start_id`` or ``args.end_id`` is positive, only the slice
    ``dataset[start_id:end_id]`` is evaluated; this requires the dataset
    reader to return a ``list``. A ``args.batch_size`` greater than -1
    overrides the batch size in the iterator configuration.
    """
    # Disable some of the more verbose logging statements
    for noisy_logger in ('allennlp.common.params', 'allennlp.nn.initializers'):
        logging.getLogger(noisy_logger).disabled = True
    embedding_logger = logging.getLogger(
        'allennlp.modules.token_embedders.embedding')
    embedding_logger.setLevel(logging.INFO)

    logging.info("Parameters:")
    for arg_name, arg_value in vars(args).items():
        logging.info(f"{arg_name}: {arg_value}")

    # Load from archive
    cuda_device = args.cuda_device
    logging.info(f"cuda_device:{cuda_device}")
    archive = load_archive(args.archive_file,
                           cuda_device=cuda_device,
                           overrides=args.overrides)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data, preferring a validation-specific reader.
    if "validation_dataset_reader" in config:
        reader_params = config.pop('validation_dataset_reader')
    else:
        reader_params = config.pop('dataset_reader')
    dataset_reader = DatasetReader.from_params(reader_params)

    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)

    batch_size = args.batch_size
    first_index = args.start_id
    last_index = args.end_id

    dataset = dataset_reader.read(evaluation_data_path)
    file_mode = args.file_open_mode

    # Optionally restrict evaluation to a contiguous slice of the instances.
    if first_index > 0 or last_index > 0:
        if not isinstance(dataset, list):
            raise ValueError(
                "dataset must be list when start_id and end_id are set")
        slice_start = max(first_index, 0)
        slice_stop = last_index if last_index > 0 else len(dataset)
        dataset = dataset[slice_start:slice_stop]

    if "validation_iterator" in config:
        iterator_config = config.pop('validation_iterator')
    else:
        iterator_config = config.pop('iterator')

    if batch_size > -1:
        # A wrapping iterator keeps its batch size on the nested base iterator.
        if "base_iterator" in iterator_config:
            batch_size_owner = iterator_config["base_iterator"]
        else:
            batch_size_owner = iterator_config
        batch_size_owner["batch_size"] = batch_size

    iterator = DataIterator.from_params(iterator_config)
    iterator.index_with(model.vocab)

    metrics = evaluate(model,
                       dataset,
                       iterator,
                       args.output_file,
                       file_mode=file_mode)
    if args.output_file:
        logging.info(
            "Output saved to \n{}".format(os.path.abspath(args.output_file)))

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for metric_name, metric_value in metrics.items():
        logger.info("%s: %s", metric_name, metric_value)

    return metrics
def main():
    """
    Command-line entry point with three commands: ``train``, ``evaluate`` and
    ``generalize`` (the last one is not implemented and only logs an error).

    ``train`` builds train/validation dataset URLs from ``--datasets`` and
    ``--data_dir``, derives the learning-rate schedule length ``t_total``
    from the dataset headers unless ``--t_total`` is given, and calls
    ``train_model_from_file`` with the resulting overrides.

    ``evaluate`` loads a model archive, evaluates it on every dataset listed
    in ``--datasets`` and writes one metrics JSON per dataset below
    ``--output_path``.

    Returns
    -------
    The metrics of the last evaluated dataset for ``evaluate``; ``None`` for
    every other command.
    """
    parse = argparse.ArgumentParser("")
    parse.add_argument("command", type=str, help="one of the following options train, evaluate, generalize")
    parse.add_argument("--datasets", type=str, help="", default=None)
    parse.add_argument("--model", type=str, help="", default=None)
    parse.add_argument("--serialization_dir", type=str, help="the directory storing the intermediate files and output", default=None)
    parse.add_argument("--cuda_device", type=str, help="Cuda device ID", default="-1")
    parse.add_argument("--split", type=str, help="dev / test", default="dev")
    parse.add_argument("--bert_type", type=str, help="Base / Large /", default="Base")
    parse.add_argument("--config", type=str, help="dev / test", default=None)
    parse.add_argument("--output_path", type=str, help="directory to which results JSONs of eval will written", default='results/eval/')
    parse.add_argument("--models_dir", type=str, help="directory containing the models used for eval , (please add '/' at the end)", default=None)
    parse.add_argument("--data_dir", type=str, help="directory containing the multiqa format datasets , (please add '/' at the end and make sure to have a headers directory with all headers under your specified path)", default='https://multiqa.s3.amazonaws.com/data/')
    parse.add_argument("--t_total", type=str, help="used for training, see BERT's learning rate schedule for details", default=None)
    parse.add_argument("--sample_size", type=str, help="used for sampling a subset of the training data", default=-1)
    parse.add_argument("--validation_sample_size", type=str, help="used for sampling a subset of the training data", default=-1)
    parse.add_argument("--batch_size", type=str, help="the batch size", default=8)
    parse.add_argument("--max_instances_in_memory", type=str, help="max number instances in memrory during training", default=5000)
    parse.add_argument("--num_epochs", type=str, help="", default=2)
    parse.add_argument("--lr", type=str, help="learning rate", default=0.00003)
    args = parse.parse_args()

    import_submodules("models")

    # TODO add best config for specific datasets as default, not one general default...
    if args.config is None:
        config = 'models/MultiQA_BERT' + args.bert_type + '.jsonnet'
    else:
        config = args.config
    config_params = Params(json.loads(_jsonnet.evaluate_file(config)))

    if args.command == 'train':
        dataset_names = args.datasets.split(',')

        # building the default dataset urls
        train_datasets = [args.data_dir + dataset + '_train.jsonl.gz'
                          for dataset in dataset_names]
        val_datasets = [args.data_dir + dataset + '_' + args.split + '.jsonl.gz'
                        for dataset in dataset_names]

        # calculating the t_total
        if args.t_total is None:
            logging.info('getting headers of the chosen dataset in order to compute learning rate schedule t_total')
            total_number_of_examples = 0
            for dataset in dataset_names:
                header_url = args.data_dir + 'headers/' + dataset + '_train.json'
                with open(cached_path(header_url), 'r') as f:
                    header = json.load(f)
                total_number_of_examples += header['number_of_qas']
            # NOTE: the batch size and epoch count come from the config file,
            # not from the --batch_size / --num_epochs overrides below.
            t_total = int(total_number_of_examples / float(config_params['iterator']['batch_size'])
                          * float(config_params['trainer']['num_epochs'])) \
                / len(args.cuda_device.split(','))
        else:
            # An explicit --t_total is passed through unchanged, like the
            # other command-line overrides.
            t_total = args.t_total

        if args.serialization_dir is None:
            serialization_dir = 'models/' + args.datasets.replace(',', '_') + \
                f"num_epochs_{args.num_epochs}_batch_size_{args.batch_size}_lr_{args.lr}"
        else:
            serialization_dir = args.serialization_dir

        print(" >>>>>>>> overriding the parameters <<<<<<<<<<< ")
        overrides = {
            'train_data_path': ','.join(train_datasets),
            'validation_data_path': ','.join(val_datasets),
            'dataset_reader': {
                'sample_size': args.sample_size,
            },
            'validation_dataset_reader': {
                'sample_size': args.validation_sample_size,
            },
            'iterator': {
                'batch_size': args.batch_size,
                'max_instances_in_memory': args.max_instances_in_memory,
            },
            'trainer': {
                'cuda_device': args.cuda_device,
                'num_epochs': args.num_epochs,
                'optimizer': {
                    't_total': t_total,
                    'lr': args.lr,
                }
            }
        }
        # Python booleans must be spelled the JSON way in the overrides string.
        overrides_str = str(overrides).replace('True', 'true').replace('False', 'false')
        train_model_from_file(config, serialization_dir, overrides_str, True, False, True, "", "")
    elif args.command == 'evaluate':
        print(" evaluate . . . ")
        if args.models_dir is None:
            model_path = 'https://multiqa.s3.amazonaws.com/models/BERT' + args.bert_type + '/' + args.model + '.tar.gz'
        else:
            model_path = args.models_dir + args.model + '.tar.gz'
        model_cached_path = cached_path(model_path)

        print(" loading models . . . .")
        overrides_str = ''
        # Load from archive
        archive = load_archive(model_cached_path, int(args.cuda_device), overrides_str, '')
        prepare_environment(config_params)
        model = archive.model
        model.eval()

        print(" loading data . . . .")
        # Load the evaluation data
        validation_dataset_reader_params = config_params.get('validation_dataset_reader', None)
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)

        print(" looping over datasets . . . .")
        # running over all validation datasets specified
        val_dataset_names = args.datasets.split(',')
        val_datasets = [args.data_dir + dataset + '_' + args.split + '.jsonl.gz'
                        for dataset in val_dataset_names]

        for val_dataset_path, val_dataset_name in zip(val_datasets, val_dataset_names):
            print(f" * * * val_dataset_name: {val_dataset_name}")
            # This is a bit strange but there is a lot of config "popping" going on implicitly in allennlp
            # so we need to have the full config reloaded every iteration...
            config_params = Params(json.loads(_jsonnet.evaluate_file(config)))

            print(f"Reading evaluation data from {val_dataset_path}")
            logger.info("Reading evaluation data from %s", val_dataset_path)
            instances = dataset_reader.read(val_dataset_path)

            # loading iterator
            iterator_params = config_params.get("validation_iterator", None)
            iterator = DataIterator.from_params(iterator_params)
            iterator.index_with(model.vocab)

            metrics = evaluate(model, instances, iterator, int(args.cuda_device), '')

            logger.info("Finished evaluating " + val_dataset_name)
            print("Finished evaluating " + val_dataset_name)

            logger.info("Metrics:")
            for key, metric in metrics.items():
                logger.info("%s: %s", key, metric)

            os.makedirs(args.output_path, exist_ok=True)
            output_path = args.output_path + args.model + '_BERT' + args.bert_type + '_eval-on_' \
                + val_dataset_name + '_' + args.split + '.json'
            with open(output_path, "w") as file:
                json.dump(metrics, file, indent=4)

        return metrics
    elif args.command == 'generalize':
        logging.error('The command %s is not yet supported', args.command)
    else:
        logging.error('The command %s is not supported', args.command)
def train_model(params: Params, serialization_dir: str) -> Model:
    """
    Train a model from a JSON-style experiment specification.

    Standard output, standard error and the root logger are additionally
    written to files inside ``serialization_dir``; the configuration is saved
    as ``model_params.json``, the vocabulary under ``vocabulary`` and the
    results are archived once training finishes.

    Note that ``prepare_environment`` is called first with ``params``; if you
    care about reproducibility, avoid running Pytorch or numpy code that
    consumes random numbers before calling this function.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.

    Returns
    -------
    The ``Model`` that was trained.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)

    # Mirror both output streams into log files.
    stdout_log = os.path.join(serialization_dir, "stdout.log")
    stderr_log = os.path.join(serialization_dir, "stderr.log")
    sys.stdout = TeeLogger(stdout_log, sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(stderr_log, sys.stderr)  # type: ignore

    file_handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    file_handler.setLevel(logging.INFO)
    log_format = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(name)s - %(message)s')
    file_handler.setFormatter(log_format)
    logging.getLogger().addHandler(file_handler)

    # Snapshot the configuration before any key is popped from it.
    config_snapshot = deepcopy(params).as_dict(quiet=True)
    config_path = os.path.join(serialization_dir, "model_params.json")
    with open(config_path, "w") as config_file:
        json.dump(config_snapshot, config_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    reader = DatasetReader.from_params(params.pop('dataset_reader'))

    train_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_path)
    train_data = reader.read(train_path)

    validation_path = params.pop('validation_data_path', None)
    if validation_path is None:
        validation_data = None
        vocab_source = train_data
    else:
        logger.info("Reading validation data from %s", validation_path)
        validation_data = reader.read(validation_path)
        # The vocabulary is built from training and validation instances.
        vocab_source = Dataset(train_data.instances + validation_data.instances)

    vocab = Vocabulary.from_params(params.pop("vocabulary", {}), vocab_source)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  params.pop("trainer"))
    params.assert_empty('base train command')
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Re-evaluates an already trained model specified in the given :class:`Params`
    object. Despite its name this variant does not train: it loads the
    vocabulary from ``serialization_dir/vocabulary`` and the weights from
    ``serialization_dir/best.th``, evaluates on the validation and test
    datasets (when present) and writes the results to ``metrics_new.json``
    in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory holding the vocabulary and ``best.th``, and in which
        the new metrics file is written.
    file_friendly_logging : ``bool``, optional (default=False)
        Forwarded to ``prepare_global_logging``.
    recover : ``bool``, optional (default=False)
        Forwarded to ``create_serialization_dir``.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights, moved to the GPU.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.get('trainer').get('cuda_device', 0))

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    # This assignment was missing, which made the loop below fail with a
    # NameError.
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))

    # The vocabulary is read back from disk rather than rebuilt.
    vocab_dir = os.path.join(serialization_dir, 'vocabulary')
    vocab = Vocabulary.from_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    print(model)

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(
            validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    # NOTE: trainer construction, training and archiving are intentionally
    # skipped; the "trainer" params are left unconsumed and
    # `params.assert_empty` is not called.
    evaluate_on_validation = True
    evaluate_on_test = True

    metrics = {}

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)
    # Evaluation below is hard-wired to GPU 0.
    best_model.cuda()

    evaluation_iterator = validation_iterator or iterator

    if validation_data and evaluate_on_validation:
        logger.info(
            "The model will be evaluated using the best epoch weights on validation"
        )
        validation_metrics = evaluate(best_model,
                                      validation_data,
                                      evaluation_iterator,
                                      cuda_device=0)
        for key, value in validation_metrics.items():
            metrics["validation_" + key] = value

    if test_data and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights on test")
        test_metrics = evaluate(best_model,
                                test_data,
                                evaluation_iterator,
                                cuda_device=0)
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    dump_metrics(os.path.join(serialization_dir, "metrics_new.json"),
                 metrics,
                 log=True)

    return best_model
def main():
    """
    Evaluate an archived model on ``--input-file`` and print/save its metrics.

    A label file named ``<input stem>_reallabel_guessedlabel.csv`` is passed
    to ``evaluate``; it is placed next to ``--weights-file`` when that option
    is given and in the current directory otherwise. Whatever iterator the
    archive configures is replaced by a ``basic`` iterator that keeps only
    the configured ``batch_size`` / ``maximum_samples_per_batch``.

    Returns
    -------
    The metrics dictionary returned by ``evaluate``. It is also printed as
    JSON and, when ``--output-file`` is given, written to that file.
    """
    import os  # local import: the original block did not rely on `os`

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--input-file', type=str,
                        help='path to the file containing the evaluation data')
    parser.add_argument('--output-file', type=str, help='path to output file')
    parser.add_argument('--weights-file', type=str,
                        help='a path that overrides which weights file to use')
    parser.add_argument('--cuda-device', type=int, default=-1,
                        help='id of GPU to use (if any)')
    parser.add_argument('--overrides', type=str, default="",
                        help='a JSON structure used to override the experiment configuration')
    parser.add_argument('--include-package', type=str, default='')
    parser.add_argument('--archive-file', type=str)
    args = parser.parse_args()

    # Both options are needed below; fail with a usage message rather than a
    # TypeError from string concatenation on None.
    if not args.input_file or not args.archive_file:
        parser.error("--input-file and --archive-file are required")

    # --weights-file is optional: without it the label file goes to the
    # current directory and the archive's own weights are used.
    label_dir = os.path.dirname(args.weights_file) if args.weights_file else ''
    # splitext on the basename copes with extension-less file names and with
    # dots in directory names, which the former rfind('.') slicing did not.
    input_stem = os.path.splitext(os.path.basename(args.input_file))[0]
    label_file = os.path.join(label_dir, input_stem + '_reallabel_guessedlabel.csv')

    print(f"Will write labels to {label_file}")
    print(f"Evaluating on {args.input_file}")
    print(f"Archive file being used is {args.archive_file}")
    print(f"Weights file being used is {args.weights_file}")
    print()

    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    if args.include_package.strip() != '':
        import_submodules(args.include_package)
    import_submodules("attn_tests_lib")
    import_submodules("textcat")

    # --overrides names a file whose lines are joined into a single string.
    if args.overrides != '':
        with open(args.overrides, 'r') as f:
            args.overrides = " ".join(line.strip() for line in f)

    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Binary classifiers additionally report F1 for label index 1.
    if model._output_logit.get_output_dim() == 2:
        model.calculate_f1 = True
        model._f1 = F1Measure(1)

    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")

    # Rebuild the iterator configuration as a plain 'basic' iterator.
    new_param_dict = {'type': 'basic'}
    for size_key in ('batch_size', 'maximum_samples_per_batch'):
        if size_key in iterator_params.params:
            new_param_dict[size_key] = iterator_params.params[size_key]
    iterator_params.params = new_param_dict

    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device, label_file)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    print('\n' + json.dumps(metrics, indent=4))
    print("Successfully wrote labels to " + label_file)

    output_file = args.output_file
    if output_file:
        with open(output_file, "w") as file:
            json.dump(metrics, file, indent=4)
    return metrics
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Runs the training job described by ``params`` and stores its artefacts
    (config copy, model archive, ``metrics.json``) in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        Forwarded to ``prepare_global_logging``.
    recover : ``bool``, optional (default=False)
        Forwarded to ``create_serialization_dir`` and to the trainer
        construction; intended for resuming a crashed run from an existing
        serialization directory.
    force : ``bool``, optional (default=False)
        Forwarded to ``create_serialization_dir``; intended to allow
        overwriting an existing serialization directory.

    Returns
    -------
    best_model: ``Model``
        ``trainer.model`` - the trainer is relied upon to hold the best
        epoch weights once training has finished.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    stdout_handler = prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.params.get('trainer').get('cuda_device', -1))

    config_path = os.path.join(serialization_dir, CONFIG_NAME)
    params.to_file(config_path)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)

    uses_default_trainer = params.get("trainer", {}).get("type", "default") == "default"
    if not uses_default_trainer:
        trainer = TrainerBase.from_params(params, serialization_dir, recover)
        # TODO(joelgrus): handle evaluation in the general case
        evaluation_iterator = None
        evaluation_dataset = None
    else:
        # Special logic to instantiate backward-compatible trainer.
        pieces = TrainerPieces.from_params(params, serialization_dir, recover)  # pylint: disable=no-member
        trainer = Trainer.from_params(
                model=pieces.model,
                serialization_dir=serialization_dir,
                iterator=pieces.iterator,
                train_data=pieces.train_dataset,
                validation_data=pieces.validation_dataset,
                params=pieces.params,
                validation_iterator=pieces.validation_iterator)
        evaluation_iterator = pieces.validation_iterator or pieces.iterator
        evaluation_dataset = pieces.test_dataset

    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # Best weights on disk mean at least one epoch finished, so an archive
        # can still be produced before the interrupt propagates.
        best_weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)
        if os.path.exists(best_weights_path):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Optional evaluation on the test split.
    if evaluation_dataset:
        if evaluate_on_test:
            logger.info("The model will be evaluated using the best epoch weights.")
            test_metrics = evaluate(
                    trainer.model, evaluation_dataset, evaluation_iterator,
                    cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
                    # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
                    batch_weight_key="")
            for metric_name, metric_value in test_metrics.items():
                metrics["test_" + metric_name] = metric_value
        else:
            logger.info("To evaluate on the test set after training, pass the "
                        "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    cleanup_global_logging(stdout_handler)

    # Package the results and persist the metrics.
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    metrics_path = os.path.join(serialization_dir, "metrics.json")
    dump_metrics(metrics_path, metrics, log=True)

    return trainer.model
if root_path not in sys.path: sys.path.insert(0, root_path) import torch import torch.nn.functional as F import numpy from overrides import overrides from typing import Dict, Optional from allennlp.common.util import prepare_environment from allennlp.common.params import Params prepare_environment(Params(params={})) from allennlp.common import Params from allennlp.common.checks import ConfigurationError from allennlp.data import Vocabulary from allennlp.modules import FeedForward, Seq2VecEncoder, TextFieldEmbedder from allennlp.models.model import Model from allennlp.nn import InitializerApplicator, RegularizerApplicator from allennlp.nn import util from allennlp.training.metrics import CategoricalAccuracy, Average @Model.register("basic_classifier") class BasicClassifier(Model): def __init__(self, vocab: Vocabulary, text_field_embedder: TextFieldEmbedder, sentence_encoder: Seq2VecEncoder,
def find_learning_rate_model(params: Params,
                             serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs a learning rate search over ``num_batches`` mini-batches and saves a
    smoothed loss-vs-learning-rate plot (``lr-losses.png``) in
    ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr : ``float``
        Learning rate to start the search.
    end_lr : ``float``
        Learning rate up to which the search is done.
    num_batches : ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps : ``bool``
        Forwarded to ``search_learning_rate``: increase the learning rate
        linearly if set, exponentially otherwise.
    stopping_factor : ``float``
        Forwarded to ``search_learning_rate``: stop the search when the current
        loss exceeds the best recorded loss by this factor.  If ``None`` the
        search proceeds until ``end_lr``.
    force : ``bool``
        If True and the serialization directory already exists, everything in
        it is removed before the search starts.

    Raises
    ------
    ConfigurationError
        If the serialization directory exists and is non-empty (after the
        optional forced removal), or if ``datasets_for_vocab_creation`` names
        an unknown dataset.
    """
    if force and os.path.exists(serialization_dir):
        shutil.rmtree(serialization_dir)

    already_populated = os.path.exists(serialization_dir) and os.listdir(serialization_dir)
    if already_populated:
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    # ``cuda_device`` may be a single id or a list of ids; check each one.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    devices_to_check = cuda_device if isinstance(cuda_device, list) else [cuda_device]
    for device in devices_to_check:
        check_for_gpu(device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for requested_name in datasets_for_vocab_creation:
        if requested_name not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {requested_name}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    vocab_instances = (instance
                       for key, dataset in all_datasets.items()
                       for instance in dataset
                       if key in datasets_for_vocab_creation)
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}), vocab_instances)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")

    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for param_name, parameter in model.named_parameters():
        for regex in no_grad_regexes:
            if re.search(regex, param_name):
                parameter.requires_grad_(False)
                break

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, raw_losses = search_learning_rate(trainer,
                                                      start_lr=start_lr,
                                                      end_lr=end_lr,
                                                      num_batches=num_batches,
                                                      linear_steps=linear_steps,
                                                      stopping_factor=stopping_factor)
    logger.info('Finished learning rate search.')

    smoothed_losses = _smooth(raw_losses, 0.98)
    plot_path = os.path.join(serialization_dir, 'lr-losses.png')
    _save_plot(learning_rates, smoothed_losses, plot_path)
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    """
    Evaluates an archived model once per temperature threshold.

    ``args.thresholds`` is a string of thresholds separated by ``,`` or ``_``.
    For each of them the model's ``_temperature_threshold`` is set, the model
    is evaluated on ``args.input_file`` and, if ``args.output_file`` is given,
    the metrics are written as JSON to ``<output_file>_<threshold>``.

    Returns
    -------
    The metrics of the last threshold evaluated.
    """
    # Disable some of the more verbose logging statements
    for noisy_logger in ('allennlp.common.params', 'allennlp.nn.initializers'):
        logging.getLogger(noisy_logger).disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data.  The validation dataset reader is preferred when
    # the config has one; otherwise the default dataset_reader is used.
    reader_params = config.pop('validation_dataset_reader', None)
    if reader_params is None:
        reader_params = config.pop('dataset_reader')
    dataset_reader = DatasetReader.from_params(reader_params)

    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    if args.embedding_sources_mapping:
        embedding_sources: Dict[str, str] = json.loads(args.embedding_sources_mapping)
    else:
        embedding_sources = {}

    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(Params({}), instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    # Thresholds may be separated by ',' or '_'.
    threshold_texts = args.thresholds.replace("_", ",").split(",")
    for threshold_text in threshold_texts:
        model._temperature_threshold = float(threshold_text)
        metrics = evaluate(model, instances, iterator, args.cuda_device, args.batch_weight_key)

        logger.info("Finished evaluating.")
        logger.info("Metrics:")
        for metric_name, metric_value in metrics.items():
            logger.info("%s: %s: %s", threshold_text, metric_name, metric_value)

        output_file = args.output_file
        if output_file:
            # One metrics file per threshold, suffixed with the threshold text.
            with open(output_file + "_" + threshold_text, "w") as handle:
                json.dump(metrics, handle, indent=4)

    return metrics
def find_learning_rate_model(
    params: Params,
    serialization_dir: str,
    start_lr: float = 1e-5,
    end_lr: float = 10,
    num_batches: int = 100,
    linear_steps: bool = False,
    stopping_factor: float = None,
    force: bool = False,
) -> None:
    """
    Runs a learning rate search over `num_batches` mini-batches and saves a
    smoothed loss-vs-learning-rate plot (`lr-losses.png`) in `serialization_dir`.

    # Parameters

    params : [`Params`](../common/params.md#params)
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results.
    start_lr : `float`
        Learning rate to start the search.
    end_lr : `float`
        Learning rate up to which the search is done.
    num_batches : `int`
        Number of mini-batches to run the learning rate finder for.
    linear_steps : `bool`
        Forwarded to `search_learning_rate`: increase the learning rate linearly
        if set, exponentially otherwise.
    stopping_factor : `float`
        Forwarded to `search_learning_rate`: stop the search when the current loss
        exceeds the best recorded loss by this factor. If `None` the search
        proceeds until `end_lr`.
    force : `bool`
        Forwarded to `create_serialization_dir`; intended to clear an existing
        serialization directory before the search.

    # Raises

    ConfigurationError
        If `datasets_for_vocab_creation` names an unknown dataset or the trainer
        type is not `gradient_descent`.
    AssertionError
        If the configuration contains a non-empty `distributed` section.
    """
    create_serialization_dir(params, serialization_dir, recover=False, force=force)

    prepare_environment(params)

    check_for_gpu(params.params.get("trainer").get("cuda_device", -1))

    # See https://github.com/allenai/allennlp/issues/3658
    distributed_params = params.params.get("distributed")
    assert not distributed_params, "find-lr is not compatible with DistributedDataParallel."

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for requested_name in datasets_for_vocab_creation:
        if requested_name not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {requested_name}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )

    vocab_instances = (
        instance
        for key, dataset in all_datasets.items()
        for instance in dataset
        if key in datasets_for_vocab_creation
    )
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}), instances=vocab_instances)

    train_data = all_datasets["train"]
    train_data.index_with(vocab)

    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    data_loader = DataLoader.from_params(dataset=train_data, params=params.pop("data_loader"))

    trainer_params = params.pop("trainer")

    # Freeze every parameter whose name matches one of the `no_grad` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for param_name, parameter in model.named_parameters():
        for regex in no_grad_regexes:
            if re.search(regex, param_name):
                parameter.requires_grad_(False)
                break

    trainer_choice = trainer_params.pop("type", "gradient_descent")
    if trainer_choice != "gradient_descent":
        raise ConfigurationError(
            "currently find-learning-rate only works with the GradientDescentTrainer"
        )

    trainer: GradientDescentTrainer = Trainer.from_params(  # type: ignore
        model=model,
        serialization_dir=serialization_dir,
        data_loader=data_loader,
        params=trainer_params,
    )

    logger.info(
        f"Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations."
    )
    learning_rates, raw_losses = search_learning_rate(
        trainer,
        start_lr=start_lr,
        end_lr=end_lr,
        num_batches=num_batches,
        linear_steps=linear_steps,
        stopping_factor=stopping_factor,
    )
    logger.info("Finished learning rate search.")

    smoothed_losses = _smooth(raw_losses, 0.98)
    plot_path = os.path.join(serialization_dir, "lr-losses.png")
    _save_plot(learning_rates, smoothed_losses, plot_path)
def train_model(params: Params, serialization_dir: str) -> Model:
    """
    Trains a multi-task model (primary task plus an auxiliary "scaffold" task)
    directly from a JSON specification.

    Note that if you care about reproducibility, you should avoid running code
    using Pytorch or numpy which affect the reproducibility of your experiment
    before you import and use this function; these libraries rely on random
    seeds which can be set in this function via the JSON specification.

    Besides training, the function writes ``stdout.log``, ``stderr.log``,
    ``python_logging.log``, ``model_params.json`` and the vocabulary into
    ``serialization_dir``, archives the model, and evaluates on the primary and
    auxiliary test sets when they are configured and ``evaluate_on_test`` is set.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.  In addition to
        the usual keys it is read for ``dataset_reader_aux``,
        ``train_data_path_aux``, ``aux_sample_fraction``, ``mixing_ratio``,
        ``validation_data_path_aux``, ``test_data_path_aux``,
        ``auxillary_datasets_for_vocab_creation``, ``iterator_aux`` and
        ``cutoff_epoch``.
    serialization_dir: str, required
        The directory in which to save results and logs.

    Returns
    -------
    The trained ``Model``.

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` or
        ``auxillary_datasets_for_vocab_creation`` names a dataset that was not
        loaded.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr)  # type: ignore
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    # Snapshot the configuration before any key is popped from it.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.

    # 1. Primary training data.
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    # 2. Auxiliary training data.
    dataset_reader_aux = DatasetReader.from_params(params.pop('dataset_reader_aux'))
    train_data_path_aux = params.pop('train_data_path_aux')
    logger.info("Reading auxilliary training data from %s", train_data_path_aux)
    train_data_aux = dataset_reader_aux.read(train_data_path_aux)

    # If only using a fraction of the auxiliary data.
    aux_sample_fraction = params.pop("aux_sample_fraction", 1.0)
    if aux_sample_fraction < 1.0:
        sample_size = int(aux_sample_fraction * len(train_data_aux.instances))
        train_data_aux = Dataset(random.sample(train_data_aux.instances, sample_size))

    # Balance the two datasets by inflating the auxiliary dataset to the size of
    # the primary one when it is smaller (case for PB scaffold).
    # NOTE: the opposite direction (inflating the primary data, the FN scaffold
    # case) is intentionally not performed.
    train_size = len(train_data.instances)
    aux_train_size = len(train_data_aux.instances)
    mixing_ratio = params.pop("mixing_ratio")
    if train_size > aux_train_size:
        difference = train_size - aux_train_size
        aux_sample = [random.choice(train_data_aux.instances) for _ in range(difference)]
        train_data_aux = Dataset(train_data_aux.instances + aux_sample)
        logger.info("Inflating auxiliary train data from {} to {} samples".format(
                aux_train_size, len(train_data_aux.instances)))

    all_datasets: Dict[str, Dataset] = {"train": train_data}
    all_datasets_aux: Dict[str, Dataset] = {"train_aux": train_data_aux}

    # 3. Primary validation data.
    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets["validation"] = validation_data
    else:
        validation_data = None

    # 4. Auxiliary validation data.
    validation_data_path_aux = params.pop('validation_data_path_aux', None)
    if validation_data_path_aux is not None:
        logger.info("Reading auxilliary validation data from %s", validation_data_path_aux)
        validation_data_aux = dataset_reader_aux.read(validation_data_path_aux)
        all_datasets_aux["validation_aux"] = validation_data_aux
    else:
        validation_data_aux = None

    # 5. Primary test data
    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = dataset_reader.read(test_data_path)
        all_datasets["test"] = test_data
    else:
        test_data = None

    # 6. Auxiliary test data
    test_data_path_aux = params.pop("test_data_path_aux", None)
    if test_data_path_aux is not None:
        logger.info("Reading auxillary test data from %s", test_data_path_aux)
        test_data_aux = dataset_reader_aux.read(test_data_path_aux)
        all_datasets_aux["test_aux"] = test_data_aux
    else:
        test_data_aux = None

    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))
    datasets_for_vocab_creation_aux = set(params.pop("auxillary_datasets_for_vocab_creation",
                                                     all_datasets_aux))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    # Validate the auxiliary selection the same way; otherwise a misspelled name
    # would silently drop auxiliary data from the vocabulary.
    for dataset in datasets_for_vocab_creation_aux:
        if dataset not in all_datasets_aux:
            raise ConfigurationError(
                    f"invalid 'auxillary_datasets_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data. Auxillary also included.",
                ", ".join(datasets_for_vocab_creation))
    dataset_primary = Dataset([instance
                               for key, dataset in all_datasets.items()
                               for instance in dataset.instances
                               if key in datasets_for_vocab_creation])
    dataset_aux = Dataset([instance
                           for key, dataset in all_datasets_aux.items()
                           for instance in dataset.instances
                           if key in datasets_for_vocab_creation_aux])
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   dataset_primary,
                                   dataset_aux=dataset_aux)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator_aux = DataIterator.from_params(params.pop("iterator_aux"))

    train_data.index_instances(vocab)
    train_data_aux.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)
    if validation_data_aux:
        validation_data_aux.index_instances(vocab)

    cutoff_epoch = params.pop("cutoff_epoch", -1)

    trainer_params = params.pop("trainer")
    trainer = MultiTaskTrainer.from_params(model=model,
                                           serialization_dir=serialization_dir,
                                           iterator=iterator,
                                           iterator_aux=iterator_aux,
                                           train_dataset=train_data,
                                           train_dataset_aux=train_data_aux,
                                           mixing_ratio=mixing_ratio,
                                           cutoff_epoch=cutoff_epoch,
                                           validation_dataset=validation_data,
                                           validation_dataset_aux=validation_data_aux,
                                           params=trainer_params,
                                           files_to_archive=params.files_to_archive)

    evaluate_on_test = params.pop("evaluate_on_test", False)
    params.assert_empty('base train command')
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_data.index_instances(vocab)
        evaluate(model, test_data, iterator, cuda_device=trainer._cuda_device)  # pylint: disable=protected-access
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    if test_data_aux and evaluate_on_test:
        test_data_aux.index_instances(vocab)
        evaluate(model, test_data_aux, iterator_aux, cuda_device=trainer._cuda_device)  # pylint: disable=protected-access
    elif test_data_aux:
        logger.info("To evaluate on the auxillary test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    return model
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    file_friendly_logging: bool = False) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely
    identical to those used for :func:`~allennlp.commands.train.train_model`,
    except that the ``model`` and ``vocabulary`` sections are ignored if present:
    the model object is already given, and its own vocabulary is reused and
    saved.  Everything else follows the training flow.

    Parameters
    ----------
    model : ``Model``
        The model to continue training; its ``vocab`` is used for indexing.
    params : ``Params``
        The experiment configuration (iterator, datasets, trainer, and
        optionally ``evaluate_on_test``).
    serialization_dir : ``str``
        The directory in which to save results and logs.  It is created with
        ``os.makedirs`` without ``exist_ok``, so it must not exist yet.
    file_friendly_logging : ``bool``, optional (default=False)
        Forwarded to ``Tqdm.set_slower_interval`` and to the ``TeeLogger``
        wrappers around stdout / stderr.

    Returns
    -------
    The same ``model`` object after training.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir)

    # TODO(mattg): pull this block out into a separate function (maybe just add this to
    # `prepare_environment`?)
    Tqdm.set_slower_interval(file_friendly_logging)
    stdout_log = os.path.join(serialization_dir, "stdout.log")
    sys.stdout = TeeLogger(stdout_log, sys.stdout, file_friendly_logging)  # type: ignore
    stderr_log = os.path.join(serialization_dir, "stderr.log")
    sys.stderr = TeeLogger(stderr_log, sys.stderr, file_friendly_logging)  # type: ignore

    file_handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(file_handler)

    # Persist a copy of the configuration before keys are popped from it.
    config_snapshot = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(config_snapshot, param_file, indent=4)

    if params.pop('model', None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    if params.pop('vocabulary', None):
        logger.warning("You passed parameters for the vocabulary in your configuration file, but "
                       "we are ignoring them, using instead the vocabulary from the saved model.")

    vocab = model.vocab
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    all_datasets = datasets_from_params(params)
    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    metrics = trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data:
        if evaluate_on_test:
            test_metrics = evaluate(model, test_data, iterator,
                                    cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
            for metric_name, metric_value in test_metrics.items():
                metrics["test_" + metric_name] = metric_value
        else:
            logger.info("To evaluate on the test set after training, pass the "
                        "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    metrics_path = os.path.join(serialization_dir, "metrics.json")
    with open(metrics_path, "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    extend_vocab: bool = False,
                    file_friendly_logging: bool = False) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely
    identical to those used for :func:`~allennlp.commands.train.train_model`,
    except that the ``model`` section is ignored, if it is present (as we are
    already given a ``Model`` here).

    The main difference between the logic done here and the logic done in
    ``train_model`` is that here we do not create the model object, and the
    vocabulary is taken from the model (optionally extended) rather than built
    from scratch.

    Parameters
    ----------
    model : ``Model``
        The model to continue training.
    params : ``Params``
        The experiment configuration (datasets, iterator, optional
        ``validation_iterator``, trainer, and optionally ``evaluate_on_test``).
    serialization_dir : ``str``
        The directory in which to save results and logs.
    extend_vocab: ``bool``, optional (default=False)
        If ``True``, we use the new instances to extend your vocabulary.
    file_friendly_logging : ``bool``, optional (default=False)
        Forwarded to ``prepare_global_logging``.

    Returns
    -------
    The same ``model`` object after training.

    Raises
    ------
    ConfigurationError
        If ``serialization_dir`` exists and is not empty, or if
        ``datasets_for_vocab_creation`` names an unknown dataset.
    """
    prepare_environment(params)
    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f"Serialization directory ({serialization_dir}) "
                                 f"already exists and is not empty.")

    os.makedirs(serialization_dir, exist_ok=True)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Persist a copy of the configuration before keys are popped from it.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    vocabulary_params = params.pop('vocabulary', {})
    if vocabulary_params.get('directory_path', None):
        logger.warning("You passed `directory_path` in parameters for the vocabulary in "
                       "your configuration file, but it will be ignored. ")

    all_datasets = datasets_from_params(params)
    vocab = model.vocab

    if extend_vocab:
        datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

        for dataset in datasets_for_vocab_creation:
            if dataset not in all_datasets:
                raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

        logger.info("Extending model vocabulary using %s data.",
                    ", ".join(datasets_for_vocab_creation))
        vocab.extend_from_instances(vocabulary_params,
                                    (instance for key, dataset in all_datasets.items()
                                     for instance in dataset
                                     if key in datasets_for_vocab_creation))

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")

    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
                                                          validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Fine-tuning interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        # Held-out data is evaluated with the validation iterator when one is
        # configured (falling back to the training iterator), so test batches
        # are built the same way validation batches were.
        test_metrics = evaluate(model, test_data, validation_iterator or iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
if args.v: logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', level=logging.INFO) # turn on logging. # Disable some of the more verbose logging statements logging.getLogger('allennlp.common.params').disabled = True logging.getLogger('allennlp.nn.initializers').disabled = True logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel( logging.INFO) # Load from archive archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file) config = archive.config config.formalism = "DUMMY" prepare_environment(config) model = archive.model model.eval() if not isinstance(model, GraphDependencyParser): raise ConfigurationError( "The loaded model seems not to be an am-parser (GraphDependencyParser)" ) # Load the evaluation data # Try to use the validation dataset reader if there is one - otherwise fall back # to the default dataset_reader used for both training and validation. validation_dataset_reader_params = config.pop('validation_dataset_reader', None) if validation_dataset_reader_params is not None: dataset_reader = DatasetReader.from_params(
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Build the vocabulary, model, iterator and trainer described by ``params``, run
    training, and write the configuration, vocabulary, model archive and
    ``metrics.json`` into ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        The experiment configuration. Its sections are consumed with ``pop`` and the
        object must be empty once the trainer has been built.
    serialization_dir : ``str``
        Directory that receives the config copy, the vocabulary, the archive and the
        metrics file.
    file_friendly_logging : ``bool``, optional (default=False)
        Forwarded to ``prepare_global_logging``.
    recover : ``bool``, optional (default=False)
        Forwarded to ``create_serialization_dir``.

    Returns
    -------
    The ``Model`` instance that was handed to the trainer.

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Snapshot the configuration before the ``pop`` calls below start consuming it.
    config_snapshot = deepcopy(params).as_dict(quiet=True)
    config_path = os.path.join(serialization_dir, CONFIG_NAME)
    with open(config_path, "w") as config_file:
        json.dump(config_snapshot, config_file, indent=4)

    all_datasets = datasets_from_params(params)
    vocab_dataset_names = set(params.pop("datasets_for_vocab_creation", all_datasets))
    unknown_names = [name for name in vocab_dataset_names if name not in all_datasets]
    if unknown_names:
        raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {unknown_names[0]}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(vocab_dataset_names))
    vocab_params = params.pop("vocabulary", {})
    # Lazily walk every dataset, keeping only instances of the selected ones.
    vocab_instances = (instance
                       for name, dataset in all_datasets.items()
                       for instance in dataset
                       if name in vocab_dataset_names)
    vocab = Vocabulary.from_params(vocab_params, vocab_instances)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model_params = params.pop('model')
    model = Model.from_params(vocab, model_params)

    iterator_params = params.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    # Anything still left in ``params`` at this point is an unrecognised option.
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # A weights file on disk means at least one epoch finished, so an archive
        # can still be produced before the interrupt is propagated.
        weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)
        if os.path.exists(weights_path):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Package the results of the finished run.
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        metrics.update({"test_" + name: value for name, value in test_metrics.items()})
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    metrics_path = os.path.join(serialization_dir, "metrics.json")
    with open(metrics_path, "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Train the model described by ``params`` and return it with the best epoch
    weights (``best.th``) loaded.

    The configuration, vocabulary, model archive and ``metrics.json`` are written
    to ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        The experiment configuration. Its sections are consumed with ``pop`` and the
        object must be empty once the trainer has been built.
    serialization_dir : ``str``
        Directory that receives the config copy, the vocabulary, the archive and the
        metrics file.
    file_friendly_logging : ``bool``, optional (default=False)
        Forwarded to ``prepare_global_logging``.
    recover : ``bool``, optional (default=False)
        Forwarded to ``create_serialization_dir``. When set and a ``vocabulary``
        directory already exists in ``serialization_dir``, that vocabulary is loaded
        instead of being rebuilt from the data.
    force : ``bool``, optional (default=False)
        Forwarded to ``create_serialization_dir``.

    Returns
    -------
    The trained ``Model`` after ``best.th`` has been loaded into it.

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # ``cuda_device`` may be a single id or a list of ids; check each of them.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    vocab_dir = os.path.join(serialization_dir, "vocabulary")
    if recover and os.path.exists(vocab_dir):
        vocab = Vocabulary.from_files(vocab_dir)
        # The saved vocabulary replaces the configured one, but the section still has
        # to be consumed or ``params.assert_empty`` below rejects the recovered run.
        params.pop("vocabulary", {})
    else:
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(vocab_dir)

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(
        model=model,
        serialization_dir=serialization_dir,
        iterator=iterator,
        train_data=train_data,
        validation_data=validation_data,
        params=trainer_params,
        validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    # Anything still left in ``params`` at this point is an unrecognised option.
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    # ``best_model`` is the same object as ``model``; the weights are loaded in place.
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model, test_data, validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    file_friendly_logging: bool = False) -> Model:
    """
    Continue training an already constructed ``model`` with the data and trainer
    settings found in ``params``.

    Unlike ``train_model``, no vocabulary or model is built here: the ``model`` and
    ``vocabulary`` sections of ``params`` are discarded with a warning, and the
    vocabulary attached to ``model`` is reused as-is.

    Parameters
    ----------
    model : ``Model``
        The model to fine-tune; its ``vocab`` attribute supplies the vocabulary.
    params : ``Params``
        The experiment configuration. Its sections are consumed with ``pop`` and the
        object must be empty once the trainer has been built.
    serialization_dir : ``str``
        Directory to create for the config copy, vocabulary, archive and metrics.
        It is created with ``os.makedirs`` without ``exist_ok``, so it must not
        exist yet.
    file_friendly_logging : ``bool``, optional (default=False)
        Forwarded to ``prepare_global_logging``.

    Returns
    -------
    The same ``model`` object that was passed in, after training.
    """
    prepare_environment(params)
    os.makedirs(serialization_dir)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Snapshot the configuration before the ``pop`` calls below start consuming it.
    config_snapshot = deepcopy(params).as_dict(quiet=True)
    config_path = os.path.join(serialization_dir, CONFIG_NAME)
    with open(config_path, "w") as config_file:
        json.dump(config_snapshot, config_file, indent=4)

    ignored_model_params = params.pop('model', None)
    if ignored_model_params:
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    ignored_vocab_params = params.pop('vocabulary', None)
    if ignored_vocab_params:
        logger.warning("You passed parameters for the vocabulary in your configuration file, but "
                       "we are ignoring them, using instead the vocabulary from the saved model.")

    vocab = model.vocab
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator_params = params.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(vocab)

    all_datasets = datasets_from_params(params)
    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    # Anything still left in ``params`` at this point is an unrecognised option.
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # A weights file on disk means at least one epoch finished, so an archive
        # can still be produced before the interrupt is propagated.
        weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)
        if os.path.exists(weights_path):
            logging.info("Fine-tuning interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Package the results of the finished run.
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        metrics.update({"test_" + name: value for name, value in test_metrics.items()})
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    metrics_path = os.path.join(serialization_dir, "metrics.json")
    with open(metrics_path, "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    file_friendly_logging: bool = False) -> Model:
    """
    Continue training an already constructed ``model`` with the data and trainer
    settings found in ``params``, extending the model's vocabulary with the new data.

    The ``model`` section of ``params`` is discarded with a warning. The
    ``vocabulary`` section is passed to ``vocab.extend_from_instances``; a
    ``directory_path`` entry in it triggers a warning that it will be ignored.

    Parameters
    ----------
    model : ``Model``
        The model to fine-tune; its ``vocab`` attribute is extended in place.
    params : ``Params``
        The experiment configuration. Its sections are consumed with ``pop`` and the
        object must be empty once the trainer has been built.
    serialization_dir : ``str``
        Directory to create for the config copy, vocabulary, archive and metrics.
        It is created with ``os.makedirs`` without ``exist_ok``, so it must not
        exist yet.
    file_friendly_logging : ``bool``, optional (default=False)
        Forwarded to ``prepare_global_logging``.

    Returns
    -------
    The same ``model`` object that was passed in, after training.

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    prepare_environment(params)
    os.makedirs(serialization_dir)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Snapshot the configuration before the ``pop`` calls below start consuming it.
    config_snapshot = deepcopy(params).as_dict(quiet=True)
    config_path = os.path.join(serialization_dir, CONFIG_NAME)
    with open(config_path, "w") as config_file:
        json.dump(config_snapshot, config_file, indent=4)

    ignored_model_params = params.pop('model', None)
    if ignored_model_params:
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    vocabulary_params = params.pop('vocabulary', {})
    configured_vocab_dir = vocabulary_params.get('directory_path', None)
    if configured_vocab_dir:
        logger.warning("You passed `directory_path` in parameters for the vocabulary in "
                       "your configuration file, but it will be ignored. "
                       "Vocabulary from the saved model will be extended with current data.")

    all_datasets = datasets_from_params(params)
    vocab_dataset_names = set(params.pop("datasets_for_vocab_creation", all_datasets))
    unknown_names = [name for name in vocab_dataset_names if name not in all_datasets]
    if unknown_names:
        raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {unknown_names[0]}")

    logger.info("Extending model vocabulary using %s data.", ", ".join(vocab_dataset_names))
    vocab = model.vocab
    # Lazily walk every dataset, keeping only instances of the selected ones.
    extension_instances = (instance
                           for name, dataset in all_datasets.items()
                           for instance in dataset
                           if name in vocab_dataset_names)
    vocab.extend_from_instances(vocabulary_params, extension_instances)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator_params = params.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for param_name, parameter in model.named_parameters():
        matches_any = any(re.search(pattern, param_name) for pattern in no_grad_regexes)
        if matches_any:
            parameter.requires_grad_(False)

    frozen_names, tunable_names = get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for frozen_name in frozen_names:
        logger.info(frozen_name)
    logger.info("Following parameters are Tunable (with gradient):")
    for tunable_name in tunable_names:
        logger.info(tunable_name)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    # Anything still left in ``params`` at this point is an unrecognised option.
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # A weights file on disk means at least one epoch finished, so an archive
        # can still be produced before the interrupt is propagated.
        weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)
        if os.path.exists(weights_path):
            logging.info("Fine-tuning interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Package the results of the finished run.
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        metrics.update({"test_" + name: value for name, value in test_metrics.items()})
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    metrics_path = os.path.join(serialization_dir, "metrics.json")
    with open(metrics_path, "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def train_model(db: FeverDocDB,
                params: Union[Params, Dict[str, Any]],
                cuda_device: int,
                serialization_dir: str) -> Model:
    """
    Train a model from a JSON-style specification, reading data through a
    :class:`FEVERSentenceReader` built on ``db``.

    Random seeds are set from ``params`` via ``prepare_environment``, so for
    reproducibility avoid running PyTorch or numpy code that consumes randomness
    before calling this function.

    Training data is always read; validation data is read and used only when
    ``validation_data_path`` is present in ``params``. The vocabulary is built from
    every dataset that was read. Logs, the parameter copy (``model_params.json``),
    the vocabulary and the model archive are written to ``serialization_dir``.

    Parameters
    ----------
    db: FeverDocDB, required.
        Passed to :class:`FEVERSentenceReader` as its first argument.
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    cuda_device: int, required.
        When not ``None``, overrides ``cuda_device`` in the ``trainer`` section.
    serialization_dir: str, required
        The directory in which to save results and logs.

    Returns
    -------
    The trained ``Model``.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    # Mirror stdout/stderr into log files for the rest of the process.
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr)  # type: ignore
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    # Snapshot the configuration before the ``pop`` calls below start consuming it.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    dataset_reader = FEVERSentenceReader(
        db,
        wiki_tokenizer=Tokenizer.from_params(ds_params.pop('wiki_tokenizer', {})),
        claim_tokenizer=Tokenizer.from_params(ds_params.pop('claim_tokenizer', {})),
        token_indexers=TokenIndexer.dict_from_params(ds_params.pop('token_indexers', {})))

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets: List[Dataset] = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        Dataset([instance
                 for dataset in all_datasets
                 for instance in dataset.instances]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        # Override the configured device on the local trainer section. The previous
        # code wrote to ``args.trainer_params``, but no ``args`` exists in this scope.
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
def find_learning_rate_model(params: Params,
                             serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Run a learning rate search over ``num_batches`` mini-batches and save a plot of
    the smoothed losses as ``lr-losses.png`` in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results. It must be absent or empty unless
        ``force`` is set.
    start_lr : ``float``
        Learning rate at which the search starts.
    end_lr : ``float``
        Learning rate up to which the search is done.
    num_batches : ``int``
        Number of mini-batches to run the learning rate finder for.
    linear_steps : ``bool``
        Forwarded to ``search_learning_rate``; the original documentation describes
        it as increasing the learning rate linearly when set, exponentially otherwise.
    stopping_factor : ``float``
        Forwarded to ``search_learning_rate``; the original documentation describes
        it as stopping the search once the current loss exceeds the best recorded
        loss by this multiple, with ``None`` meaning the search runs to ``end_lr``.
    force : ``bool``
        If True and the serialization directory already exists, everything in it
        is removed before the search.

    Raises
    ------
    ConfigurationError
        If the serialization directory exists and is not empty (and ``force`` is not
        set), or if ``datasets_for_vocab_creation`` names a dataset that was not
        loaded.
    """
    if force and os.path.exists(serialization_dir):
        shutil.rmtree(serialization_dir)

    # Refuse to write into a directory that already holds files.
    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    # ``cuda_device`` may be a single id or a list of ids; check each of them.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if not isinstance(cuda_device, list):
        check_for_gpu(cuda_device)
    else:
        for device_id in cuda_device:
            check_for_gpu(device_id)

    all_datasets = datasets_from_params(params)
    vocab_dataset_names = set(params.pop("datasets_for_vocab_creation", all_datasets))
    unknown_names = [name for name in vocab_dataset_names if name not in all_datasets]
    if unknown_names:
        raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {unknown_names[0]}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(vocab_dataset_names))
    vocab_params = params.pop("vocabulary", {})
    # Lazily walk every dataset, keeping only instances of the selected ones.
    vocab_instances = (instance
                       for name, dataset in all_datasets.items()
                       for instance in dataset
                       if name in vocab_dataset_names)
    vocab = Vocabulary.from_params(vocab_params, vocab_instances)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for param_name, parameter in model.named_parameters():
        matches_any = any(re.search(pattern, param_name) for pattern in no_grad_regexes)
        if matches_any:
            parameter.requires_grad_(False)

    # The search only needs training data, so no validation pieces are supplied.
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, raw_losses = search_learning_rate(trainer,
                                                      start_lr=start_lr,
                                                      end_lr=end_lr,
                                                      num_batches=num_batches,
                                                      linear_steps=linear_steps,
                                                      stopping_factor=stopping_factor)
    logger.info(f'Finished learning rate search.')

    smoothed_losses = _smooth(raw_losses, 0.98)
    plot_path = os.path.join(serialization_dir, 'lr-losses.png')
    _save_plot(learning_rates, smoothed_losses, plot_path)
def train_model(args: argparse.Namespace,
                params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Train the model described by ``params``, optionally starting from a saved
    archive, and write the configuration, model archive and ``metrics.json`` into
    ``serialization_dir``.

    When ``args.archive_file`` is given, the model and its vocabulary come from that
    archive, the ``vocabulary`` and ``model`` sections of ``params`` are discarded,
    and the trainer's patience and epoch count are both set to 10000. Otherwise the
    vocabulary and model are built from ``params`` and the vocabulary is saved.

    Parameters
    ----------
    args : ``argparse.Namespace``
        Reads ``archive_file``; when that is set, also ``weights_file``,
        ``cuda_device`` and ``overrides``.
    params : ``Params``
        The experiment configuration. Its sections are consumed with ``pop`` and the
        object must be empty once the trainer has been built.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        Forwarded to ``prepare_global_logging``.
    recover : ``bool``, optional (default=False)
        Forwarded to ``create_serialization_dir``.

    Returns
    -------
    The ``Model`` instance that was handed to the trainer.

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # Snapshot the configuration before the ``pop`` calls below start consuming it.
    config_snapshot = deepcopy(params).as_dict(quiet=True)
    config_path = os.path.join(serialization_dir, CONFIG_NAME)
    with open(config_path, "w") as config_file:
        json.dump(config_snapshot, config_file, indent=4)

    all_datasets = datasets_from_params(params)
    vocab_dataset_names = set(params.pop("datasets_for_vocab_creation", all_datasets))
    unknown_names = [name for name in vocab_dataset_names if name not in all_datasets]
    if unknown_names:
        raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {unknown_names[0]}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(vocab_dataset_names))

    if args.archive_file is None:
        vocab_params = params.pop("vocabulary", {})
        # Lazily walk every dataset, keeping only instances of the selected ones.
        vocab_instances = (instance
                           for name, dataset in all_datasets.items()
                           for instance in dataset
                           if name in vocab_dataset_names)
        vocab = Vocabulary.from_params(vocab_params, vocab_instances)
        vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
        model = Model.from_params(vocab, params.pop('model'))
    else:
        # Model and vocabulary come from the archive; the matching config sections
        # are consumed only so that ``assert_empty`` below passes.
        params.pop("vocabulary", {})
        params.pop('model')
        archive = load_archive(args.archive_file,
                               weights_file=args.weights_file,
                               cuda_device=args.cuda_device,
                               overrides=args.overrides)
        model = archive.model
        model.eval()
        vocab = model.vocab

    iterator_params = params.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    if args.archive_file is not None:
        # Override the configured stopping limits when continuing from an archive.
        trainer._patience = 10000
        trainer._num_epochs = 10000

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    # Anything still left in ``params`` at this point is an unrecognised option.
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # A weights file on disk means at least one epoch finished, so an archive
        # can still be produced before the interrupt is propagated.
        weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)
        if os.path.exists(weights_path):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Package the results of the finished run.
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        metrics.update({"test_" + name: value for name, value in test_metrics.items()})
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    metrics_path = os.path.join(serialization_dir, "metrics.json")
    with open(metrics_path, "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def main(args):
    """
    Run a semi-supervised training job described by ``args.params_file``.

    Loads the configuration, prepares the serialization directory and logging,
    builds the trainer pieces, the constraints model and its optimizer, optionally
    restores model weights / training state from ``args.weight_dir``, then trains
    with ``gan_trainer.SemiSupervisedTrainer`` and writes the model archive and
    ``metrics.json``.

    Parameters
    ----------
    args
        Parsed command line arguments. Attributes read here: ``params_file``,
        ``recover``, ``force``, ``weight_dir``, ``weight_file``, ``dd_file``,
        ``training_state_file``, ``debug`` and ``shuffle``.

    Raises
    ------
    ValueError
        If ``args.weight_dir`` is given but both ``args.weight_file`` and
        ``args.dd_file`` are ``None``.
    """
    params = Params.from_file(args.params_file)

    settings.cuda = params['cuda_device'] != -1
    common_util.prepare_environment(params)

    serialization_dir = params['serialization_dir']
    training_util.create_serialization_dir(params, serialization_dir,
                                           args.recover, args.force)
    common_util.prepare_global_logging(serialization_dir, True)

    logging.info(
        "torch version: {}, allennlp version: {}, allennlp path: {}".format(
            torch.__version__, allennlp.__version__, allennlp.__path__))
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    # Options read with ``get`` stay in ``params`` and are removed by the key sweep
    # below; options read with ``pop`` are consumed immediately.
    semi_supervision = params.get('semi_supervised', False)
    which_mixer = params.get('which_mixer', 'cm')
    dd_warmup_iters = params.pop('dd_warmup_iters', 1)
    dd_semi_warmup_iters = params.pop('dd_semi_warmup_iters', 1)
    dd_update_freq = params.pop('dd_update_freq', 2)
    constraints_wt = params.get('constraints_wt', 0)
    calc_valid_freq = params.get('calc_valid_freq', 1)
    backprop_after_xbatches = params.pop('backprop_after_xbatches', 1)
    min_pct_of_unlabelled = params.pop('min_pct_of_unlabelled', 0.0)
    dd_increase_freq_after = params.pop('dd_increase_freq_after', 0)
    dd_increase_freq_by = params.pop('dd_increase_freq_by', 0)
    dd_decay_lr = params.pop('dd_decay_lr', 0)
    dd_decay_lr_after = params.pop('dd_decay_lr_after', 1.0)
    grad_norm_before_warmup = params.pop('grad_norm_before_warmup', 0)
    if semi_supervision:
        print("Semi Supervision On")

    # Drop script-level keys so that ``params.assert_empty`` below only complains
    # about options nothing has consumed.
    for key in [
            'warmup_epochs', 'unlabelled_train_data_file', 'test_data_file',
            'data_dir', 'cuda_device', 'serialization_dir', 'train_data_file',
            'validation_data_file', 'constraints_wt', 'train_size',
            'shuffle_id', 'semi_supervised', 'which_mixer',
            'distributed_lambda_update', 'calc_valid_freq'
    ]:
        params.pop(key, None)

    print("Trainer pieces")
    pieces = gan_trainer.TrainerPiecesForSemi.from_params(
        params, serialization_dir, args.recover, semi_supervision)  # pylint: disable=no-member

    # Pieces for constrained learning.
    print("Constraint model")
    constraints_model = Model.from_params(vocab=pieces.model.vocab,
                                          params=params.pop('dd_constraints'))
    dd_params = [[n, p] for n, p in constraints_model.named_parameters()
                 if p.requires_grad]
    dd_optimizer = None
    dd_optim_params = params.pop('dd_optimizer', None)
    # Only build an optimizer when the constraints model has trainable parameters.
    if dd_params:
        dd_optimizer = Optimizer.from_params(dd_params, dd_optim_params)

    chfile = None
    if args.weight_dir is not None:
        if args.weight_file is None and args.dd_file is None:
            # A plain string cannot be raised in Python 3; use a real exception so
            # the message reaches the user.
            raise ValueError(
                "why provide args.weight_dir? when both weight_file and dd_file are None"
            )
        if args.weight_file is not None:
            logging.info("Loading Model weights from :{}".format(
                os.path.join(args.weight_dir, args.weight_file)))
            model_states = torch.load(
                os.path.join(args.weight_dir, args.weight_file))
            pieces.model.load_state_dict(model_states)
        if args.dd_file is not None:
            logging.info("Loading Constraint Model from :{}".format(
                os.path.join(args.weight_dir, args.dd_file)))
            # Only the path is recorded here; it is handed to
            # ``SemiSupervisedTrainer`` through its ``chfile`` argument.
            chfile = os.path.join(args.weight_dir, args.dd_file)

    print("Trainer")
    trainer = Trainer.from_params(
        model=pieces.model,
        serialization_dir=serialization_dir,
        iterator=pieces.iterator,
        train_data=pieces.train_dataset,
        validation_data=pieces.validation_dataset,
        params=pieces.params,
        validation_iterator=pieces.validation_iterator)

    if args.weight_dir is not None and args.training_state_file is not None:
        logging.info("Loading Training state from :{}".format(
            os.path.join(args.weight_dir, args.training_state_file)))
        training_state = torch.load(
            os.path.join(args.weight_dir, args.training_state_file))
        trainer.optimizer.load_state_dict(training_state["optimizer"])

    # Anything still left in ``params`` at this point is an unrecognised option.
    params.assert_empty('base train command')

    try:
        print("Training setup")
        semi_trainer = gan_trainer.SemiSupervisedTrainer(
            trainer,
            constraints_model,
            dd_optimizer,
            pieces.validation_iterator,
            pieces.unlabelled_dataset,
            semi_supervision,
            which_mixer,
            dd_warmup_iters,
            dd_update_freq,
            constraints_wt,
            calc_valid_freq,
            backprop_after_xbatches,
            min_pct_of_unlabelled,
            dd_semi_warmup_iters,
            dd_increase_freq_after,
            dd_increase_freq_by,
            dd_decay_lr,
            args.debug,
            chfile=chfile,
            shuffle=args.shuffle,
            dd_decay_lr_after=dd_decay_lr_after,
            grad_norm_before_warmup=grad_norm_before_warmup)
        print("Training start")
        metrics = semi_trainer.custom_train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    archive_model(serialization_dir, files_to_archive=params.files_to_archive)
    common_util.dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                             metrics,
                             log=True)
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    """Evaluate an archived model on the file named by ``args.input_file``.

    The model and its configuration are restored via ``load_archive``, the
    model is switched to evaluation mode, the evaluation instances are read
    and indexed against the model's vocabulary, and the resulting metrics
    are handed to ``dump_metrics`` together with ``args.output_file``.

    Attributes read from ``args``: ``file_friendly_logging``,
    ``archive_file``, ``weights_file``, ``cuda_device``, ``overrides``,
    ``input_file``, ``embedding_sources_mapping``, ``extend_vocab``,
    ``batch_size``, ``batch_weight_key`` and ``output_file``.

    Side effects: sets ``common_logging.FILE_FRIENDLY_LOGGING`` and changes
    the state of three ``allennlp.*`` loggers for the rest of the process.

    Returns:
        The metrics object produced by ``evaluate``.
    """
    common_logging.FILE_FRIENDLY_LOGGING = args.file_friendly_logging

    # Silence the more verbose loggers before any heavy lifting starts.
    for noisy_logger_name in ("allennlp.common.params", "allennlp.nn.initializers"):
        logging.getLogger(noisy_logger_name).disabled = True
    embedding_logger = logging.getLogger("allennlp.modules.token_embedders.embedding")
    embedding_logger.setLevel(logging.INFO)

    # Restore the trained model together with the configuration it was built from.
    loaded_archive = load_archive(
        args.archive_file,
        weights_file=args.weights_file,
        cuda_device=args.cuda_device,
        overrides=args.overrides,
    )
    config = loaded_archive.config
    prepare_environment(config)
    model = loaded_archive.model
    model.eval()

    # Prefer a dedicated validation dataset reader; only when the config has
    # none do we consume the default "dataset_reader" entry.
    reader_params = config.pop("validation_dataset_reader", None)
    if reader_params is None:
        reader_params = config.pop("dataset_reader")
    dataset_reader = DatasetReader.from_params(reader_params)

    data_path = args.input_file
    logger.info("Reading evaluation data from %s", data_path)
    instances = dataset_reader.read(data_path)

    # The mapping is parsed up front, even though it is only consumed when
    # the vocabulary is extended below.
    if args.embedding_sources_mapping:
        embedding_sources = json.loads(args.embedding_sources_mapping)
    else:
        embedding_sources = {}

    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    instances.index_with(model.vocab)

    # Same fallback scheme as for the reader: validation-specific loader
    # parameters win over the generic "data_loader" entry.
    loader_params = config.pop("validation_data_loader", None)
    if loader_params is None:
        loader_params = config.pop("data_loader")
    if args.batch_size:
        # A truthy command-line batch size overrides the configured one.
        loader_params["batch_size"] = args.batch_size
    data_loader = DataLoader.from_params(dataset=instances, params=loader_params)

    metrics = evaluate(model, data_loader, args.cuda_device, args.batch_weight_key)
    logger.info("Finished evaluating.")

    dump_metrics(args.output_file, metrics, log=True)
    return metrics