def train_model(
    params: Params,
    serialization_dir: str,
    file_friendly_logging: bool = False,
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    batch_weight_key: str = "",
) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data
    and training parameters also specified in that object, and saves the results
    in ``serialization_dir``.

    # Parameters

    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal,
        and we slow down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during
        the middle of a run. For continuing training a model on new data, see
        ``Model.from_archive``.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    node_rank : ``int``, optional
        Rank of the current node in distributed training.
    include_package : ``List[str]``, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.

    # Returns

    best_model : ``Model``
        The model with the best epoch weights.
    """
    training_util.create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            file_friendly_logging=file_friendly_logging,
            include_package=include_package,
            batch_weight_key=batch_weight_key,
        )
        archive_model(serialization_dir)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        logging.info(
            "Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}"
        )

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data iterators in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        if params.get("vocabulary", Params({})).get("type", "") != "from_files":
            vocab = training_util.make_vocab_from_params(params.duplicate(), serialization_dir)
            params["vocabulary"] = {
                "type": "from_files",
                "directory": os.path.join(serialization_dir, "vocabulary"),
                "padding_token": vocab._padding_token,
                "oov_token": vocab._oov_token,
            }

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                file_friendly_logging,
                include_package,
                batch_weight_key,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
            ),
            nprocs=num_procs,
        )
        archive_model(serialization_dir)
        model = Model.load(params, serialization_dir)
        return model
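# Usage sketch (illustrative only): the config path, output directory, and package
# name below are placeholders, not part of the code above.
#
#     params = Params.from_file("experiments/my_experiment.jsonnet")
#     model = train_model(
#         params,
#         serialization_dir="runs/my_experiment",
#         include_package=["my_project"],  # extra imports for distributed workers
#         force=True,
#     )
#
# Whether a single process or a torch.multiprocessing spawn is used is decided
# entirely by the presence of a "distributed" section in the config.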
def train_model(params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False, force: bool = False, cache_directory: str = None, cache_prefix: str = None) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. force : ``bool``, optional (default=False) If ``True``, we will overwrite the serialization directory if it already exists. cache_directory : ``str``, optional For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`. cache_prefix : ``str``, optional For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ create_serialization_dir(params, serialization_dir, recover, force) stdout_handler = prepare_global_logging(serialization_dir, file_friendly_logging) prepare_environment(params) cuda_device = params.params.get('trainer').get('cuda_device', -1) check_for_gpu(cuda_device) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) evaluate_on_test = params.pop_bool("evaluate_on_test", False) trainer_type = params.get("trainer", {}).get("type", "default") if trainer_type == "default": # Special logic to instantiate backward-compatible trainer. pieces = TrainerPieces.from_params( params, # pylint: disable=no-member serialization_dir, recover, cache_directory, cache_prefix) trainer = Trainer.from_params( model=pieces.model, serialization_dir=serialization_dir, iterator=pieces.iterator, train_data=pieces.train_dataset, validation_data=pieces.validation_dataset, params=pieces.params, validation_iterator=pieces.validation_iterator) evaluation_iterator = pieces.validation_iterator or pieces.iterator evaluation_dataset = pieces.test_dataset else: if evaluate_on_test: raise ValueError( "--evaluate-on-test only works with the default Trainer. " "If you're using the CallbackTrainer you can use a callback " "to evaluate at Events.TRAINING_END; otherwise you'll have " "to run allennlp evaluate separately.") trainer = TrainerBase.from_params(params, serialization_dir, recover, cache_directory, cache_prefix) evaluation_dataset = None params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. 
Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Evaluate if evaluation_dataset and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( trainer.model, evaluation_dataset, evaluation_iterator, cuda_device=trainer._cuda_devices[0], # pylint: disable=protected-access, # TODO(brendanr): Pass in an arg following Joel's trainer refactor. batch_weight_key="") for key, value in test_metrics.items(): metrics["test_" + key] = value elif evaluation_dataset: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") cleanup_global_logging(stdout_handler) # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) # We count on the trainer to have the model with best weights return trainer.model
def train_model( params: Params, serialization_dir: Union[str, PathLike], recover: bool = False, force: bool = False, node_rank: int = 0, include_package: List[str] = None, dry_run: bool = False, file_friendly_logging: bool = False, ) -> Optional[Model]: """ Trains the model specified in the given [`Params`](../common/params.md#params) object, using the data and training parameters also specified in that object, and saves the results in `serialization_dir`. # Parameters params : `Params` A parameter object specifying an AllenNLP Experiment. serialization_dir : `str` The directory in which to save results and logs. recover : `bool`, optional (default=`False`) If `True`, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see `Model.from_archive`. force : `bool`, optional (default=`False`) If `True`, we will overwrite the serialization directory if it already exists. node_rank : `int`, optional Rank of the current node in distributed training include_package : `List[str]`, optional In distributed mode, extra packages mentioned will be imported in trainer workers. dry_run : `bool`, optional (default=`False`) Do not train a model, but create a vocabulary, show dataset statistics and other training information. file_friendly_logging : `bool`, optional (default=`False`) If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. # Returns best_model : `Optional[Model]` The model with the best epoch weights or `None` if in dry run. """ common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging training_util.create_serialization_dir(params, serialization_dir, recover, force) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) distributed_params = params.params.pop("distributed", None) # If distributed isn't in the config and the config contains strictly # one cuda device, we just run a single training process. if distributed_params is None: model = _train_worker( process_rank=0, params=params, serialization_dir=serialization_dir, include_package=include_package, dry_run=dry_run, file_friendly_logging=file_friendly_logging, ) if not dry_run: archive_model(serialization_dir) return model # Otherwise, we are running multiple processes for training. else: # We are careful here so that we can raise a good error if someone # passed the wrong thing - cuda_devices are required. device_ids = distributed_params.pop("cuda_devices", None) multi_device = isinstance(device_ids, list) and len(device_ids) > 1 num_nodes = distributed_params.pop("num_nodes", 1) if not (multi_device or num_nodes > 1): raise ConfigurationError( "Multiple cuda devices/nodes need to be configured to run distributed training." ) check_for_gpu(device_ids) master_addr = distributed_params.pop("master_address", "127.0.0.1") master_port = distributed_params.pop("master_port", 29500) num_procs = len(device_ids) world_size = num_nodes * num_procs # Creating `Vocabulary` objects from workers could be problematic since # the data loaders in each worker will yield only `rank` specific # instances. Hence it is safe to construct the vocabulary and write it # to disk before initializing the distributed context. The workers will # load the vocabulary from the path specified. 
vocab_dir = os.path.join(serialization_dir, "vocabulary") if recover: vocab = Vocabulary.from_files(vocab_dir) else: vocab = training_util.make_vocab_from_params( params.duplicate(), serialization_dir, print_statistics=dry_run) params["vocabulary"] = { "type": "from_files", "directory": vocab_dir, "padding_token": vocab._padding_token, "oov_token": vocab._oov_token, } logging.info( "Switching to distributed training mode since multiple GPUs are configured | " f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | " f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | " f"World size: {world_size}") mp.spawn( _train_worker, args=( params.duplicate(), serialization_dir, include_package, dry_run, node_rank, master_addr, master_port, world_size, device_ids, file_friendly_logging, ), nprocs=num_procs, ) if dry_run: return None else: archive_model(serialization_dir) model = Model.load(params, serialization_dir) return model
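# Illustrative "distributed" config section consumed by the branch above (key names
# come from the pops in the code; the device list and port are placeholder values):
#
#     "distributed": {
#         "cuda_devices": [0, 1],          # required: more than one device, or num_nodes > 1
#         "num_nodes": 1,                  # optional, defaults to 1
#         "master_address": "127.0.0.1",   # optional, defaults to 127.0.0.1
#         "master_port": 29500             # optional, defaults to 29500
#     }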
def find_learning_rate_model(
    params: Params,
    serialization_dir: str,
    start_lr: float = 1e-5,
    end_lr: float = 10,
    num_batches: int = 100,
    linear_steps: bool = False,
    stopping_factor: float = None,
    force: bool = False,
) -> None:
    """
    Runs a learning rate search for the given `num_batches` and saves the results in
    `serialization_dir`.

    # Parameters

    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results.
    start_lr : `float`
        Learning rate at which to start the search.
    end_lr : `float`
        Learning rate up to which the search is done.
    num_batches : `int`
        Number of mini-batches to run the learning rate finder for.
    linear_steps : `bool`
        If `True`, increase the learning rate linearly; otherwise increase it exponentially.
    stopping_factor : `float`
        Stop the search when the current loss exceeds the best loss recorded by a multiple
        of the stopping factor. If `None`, the search proceeds until `end_lr`.
    force : `bool`
        If `True` and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    create_serialization_dir(params, serialization_dir, recover=False, force=force)

    prepare_environment(params)

    cuda_device = params.params.get("trainer").get("cuda_device", -1)
    check_for_gpu(cuda_device)
    distributed_params = params.params.get("distributed")
    # See https://github.com/allenai/allennlp/issues/3658
    assert not distributed_params, "find-lr is not compatible with DistributedDataParallel."

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )

    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        instances=(
            instance
            for key, dataset in all_datasets.items()
            for instance in dataset
            if key in datasets_for_vocab_creation
        ),
    )

    train_data = all_datasets["train"]
    train_data.index_with(vocab)
    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    data_loader = DataLoader.from_params(dataset=train_data, params=params.pop("data_loader"))

    trainer_params = params.pop("trainer")

    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer_choice = trainer_params.pop("type", "gradient_descent")
    if trainer_choice != "gradient_descent":
        raise ConfigurationError(
            "currently find-learning-rate only works with the GradientDescentTrainer"
        )
    trainer: GradientDescentTrainer = Trainer.from_params(  # type: ignore
        model=model,
        serialization_dir=serialization_dir,
        data_loader=data_loader,
        params=trainer_params,
    )

    logger.info(
        f"Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations."
    )
    learning_rates, losses = search_learning_rate(
        trainer,
        start_lr=start_lr,
        end_lr=end_lr,
        num_batches=num_batches,
        linear_steps=linear_steps,
        stopping_factor=stopping_factor,
    )
    logger.info("Finished learning rate search.")
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses, os.path.join(serialization_dir, "lr-losses.png"))
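# Usage sketch (illustrative only): the config path, output directory, and search
# bounds below are placeholders.
#
#     params = Params.from_file("experiments/my_model.jsonnet")
#     find_learning_rate_model(
#         params,
#         serialization_dir="runs/lr_search",
#         start_lr=1e-5,
#         end_lr=10,
#         num_batches=100,
#         force=True,
#     )
#
# The smoothed loss curve is saved to <serialization_dir>/lr-losses.png.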
def train_model( params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False, force: bool = False, cache_directory: str = None, cache_prefix: str = None, node_rank: int = 0, include_package: List[str] = None, ) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. # Parameters params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. force : ``bool``, optional (default=False) If ``True``, we will overwrite the serialization directory if it already exists. cache_directory : ``str``, optional For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`. cache_prefix : ``str``, optional For caching data pre-processing. See :func:`allennlp.training.util.datasets_from_params`. node_rank : ``int``, optional Rank of the current node in distributed training include_package : ``List[str]``, optional In distributed mode, extra packages mentioned will be imported in trainer workers. # Returns best_model : ``Model`` The model with the best epoch weights. """ create_serialization_dir(params, serialization_dir, recover, force) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) distributed_params = params.params.pop("distributed", None) # If distributed isn't in the config and the config contains strictly # one cuda device, we just run a single training process. if distributed_params is None: model = _train_worker( process_rank=0, params=params, serialization_dir=serialization_dir, file_friendly_logging=file_friendly_logging, recover=recover, cache_directory=cache_directory, cache_prefix=cache_prefix, include_package=include_package, ) archive_model(serialization_dir, files_to_archive=params.files_to_archive) return model # Otherwise, we are running multiple processes for training. else: # We are careful here so that we can raise a good error if someone # passed the wrong thing - cuda_devices are required. device_ids = distributed_params.pop("cuda_devices", None) multi_device = isinstance(device_ids, list) and len(device_ids) > 1 num_nodes = distributed_params.pop("num_nodes", 1) if not (multi_device or num_nodes > 1): raise ConfigurationError( "Multiple cuda devices/nodes need to be configured to run distributed training." 
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        master_port = distributed_params.pop("master_port", 29500)
        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        os.environ["MASTER_ADDR"] = master_addr
        os.environ["MASTER_PORT"] = str(master_port)
        os.environ["WORLD_SIZE"] = str(world_size)

        logging.info(
            "Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}"
        )

        # Creating `Vocabulary` objects from workers could be problematic since the data iterators
        # in each worker will yield only `rank` specific instances. Hence it is safe to construct
        # the vocabulary and write it to disk before initializing the distributed context. The
        # workers will load the vocabulary from the path specified.
        make_vocab_from_params(params.duplicate(), serialization_dir)
        params["vocabulary"] = {
            "directory_path": os.path.join(serialization_dir, "vocabulary"),
            "extend": False,  # vocab extension would have been done above
        }

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                file_friendly_logging,
                recover,
                cache_directory,
                cache_prefix,
                include_package,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
            ),
            nprocs=num_procs,
        )
        archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        model = Model.load(params, serialization_dir)
        return model
required=True, help="Path to parameter file describing the multi-tasked model to be trained.", type=str, ) parser.add_argument( "-r", "--recover", action="store_true", default=False, help="Recover a previous training from the state in serialization_dir.", ) args = parser.parse_args() params = Params.from_file(params_file=args.config_file_path) serialization_dir = args.serialization_dir create_serialization_dir(params, serialization_dir, args.recover,force=True) serialization_params = deepcopy(params).as_dict(quiet=True) with open(os.path.join(serialization_dir, "config.json"), "w") as param_file: json.dump(serialization_params, param_file, indent=4) ### Instantiate the different tasks from the param file, load datasets and create vocabulary ### tasks, vocab = tasks_and_vocab_from_params(params=params, serialization_dir=serialization_dir) ### Load the data iterators for each task ### tasks = create_and_set_iterators(params=params, task_list=tasks, vocab=vocab) ### Load Regularizations ### regularizer = RegularizerApplicator.from_params(params.pop("regularizer", [])) ### Create model ###
print("Semi Supervision On") #if semi_supervision: params['unlabelled_train_data_path'] = os.path.join( params['data_dir'], 'shuffle' + str(shuffle_id), params['unlabelled_train_data_file']) params['unlabelled_dataset_reader']['start_from'] = train_size params['dataset_reader']['how_many_sentences'] = train_size #params['model']['train_size'] = train_size params['serialization_dir'] = os.path.join( os.path.dirname(os.path.dirname(params['serialization_dir'])), 'shuffle' + str(shuffle_id), 'ts' + str(params['train_size'])) serialization_dir = params['serialization_dir'] training_util.create_serialization_dir(params, serialization_dir, args.recover, args.force) common_util.prepare_global_logging(serialization_dir, True) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) for key in [ 'warmup_epochs', 'unlabelled_train_data_file', 'test_data_file', 'data_dir', 'cuda_device', 'serialization_dir', 'train_data_file', 'validation_data_file', 'constraints_wt', 'train_size', 'shuffle_id', 'semi_supervised', 'which_mixer', 'distributed_lambda_update' ]: params.pop(key, None) #Pdb().set_trace() pieces = gan_trainer_hm.TrainerPiecesForSemi.from_params( params, serialization_dir, args.recover, semi_supervision) # pylint: disable=no-member
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False,
                debate_mode: List[str] = ('f',),
                judge_filename: str = None,
                update_judge: bool = False,
                eval_mode: bool = False,
                reward_method: str = None,
                detach_value_head: bool = False,
                breakpoint_level: int = 0,
                search_outputs_path: str = None,
                accumulation_steps: int = 1,
                multi_gpu: bool = False,
                choice_mode: str = None,
                qa_loss_weight: float = 0.,
                influence_reward: bool = False,
                theory_of_mind: bool = False,
                num_pred_rounds: int = -1,
                x_order_prob: float = 0.,
                require_action: bool = False,
                single_shot: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    debate_mode : ``List[str]``
        List of debate turns (e.g. aa, ar, rr, Ar); capitalization implies a search agent.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    judge_filename : ``str``, optional (default=None)
        Path to judge config or pre-trained judge model. If a config, the judge is trained during
        debate. Necessary parameter if running in debate mode.
    update_judge : ``bool``, optional (default=False)
        Whether or not to update the judge model during debate training.
    eval_mode : ``bool``, optional (default=False)
        Whether or not to run in eval-only mode, on test data. Does not update/train any of the
        models.
    reward_method : ``str``, optional (default=None)
        Choice of reward function (RL) or loss function (Supervised Learning) for training debate
        agents.
    detach_value_head : ``bool``, optional (default=False)
        Whether or not to detach value function gradient updates from the policy network. This
        prevents value function gradients from affecting policy network parameters.
    breakpoint_level : ``int``, optional (default=0)
        Debugging option to set breakpoint sensitivity (0 - no breakpoints).
    search_outputs_path : ``str``, optional (default=None)
        Path to file with search predictions for each agent - necessary for supervised training.
    accumulation_steps : ``int`` (default=1)
        Number of gradient steps to accumulate over before performing an update. Poor-man's
        batching for instances where the number of examples per batch is small (limited GPU
        memory).
    multi_gpu : ``bool`` (default=False)
        Whether or not to run models/training in model parallel mode. Requires specifying GPU
        allocations for trainer, judge, and debaters in the training config file (see
        training_config/bidaf.race.size=0.5.gpu=2.jsonnet for example usage).

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    assert (not single_shot) or eval_mode, \
        'Using single shot prediction outside eval_mode not yet supported.'
assert (not single_shot) or (num_pred_rounds == -1), \ 'Using single shot prediction for a specific number of rounds is not yet supported.' # Get number of debate turns, and assert that not performing judge-only training num_no_qa_turns = sum([(('l' in debate_turn) or ('w' in debate_turn)) for debate_turn in debate_mode]) if (qa_loss_weight > 0) and (num_no_qa_turns == 0): warnings.warn( 'Unused argument qa_loss_weight in debate mode ' + str(debate_mode) + '. If this was unintentional, please remove the -q flag.', UserWarning) not_using_trained_debater = len( set('ablwⅰⅱⅲⅳ').intersection(''.join(debate_mode))) == 0 if (judge_filename is not None) and not_using_trained_debater: warnings.warn( 'Unnecessary to have debaters in debate mode ' + str(debate_mode) + '. If this was unintentional, please remove the -j flag.', UserWarning) prepare_environment(params) create_serialization_dir(params, serialization_dir, recover, force) prepare_global_logging(serialization_dir, file_friendly_logging) # Check that all Desired CUDA Devices exist => trainer => cuda_devices should contain list of required devices cuda_device = params.params.get('trainer').get('cuda_device', -1) check_for_gpu(cuda_device) # Build Allocation Dictionary (to be passed to all future functions) if multi_gpu: gpu_allocations, allocation_dict = params.params.pop( 'gpu_allocations', {}), {} assert len(gpu_allocations ) == 3, 'Must set gpu_allocations in config if multi-gpu' for k in ['debate', 'judge', 'trainer']: assert gpu_allocations[ k] in cuda_device, "Desired GPU not available... current: %s" % str( cuda_device) allocation_dict[k] = gpu_allocations[k] else: allocation_dict = {} params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) evaluate_on_test = params.pop_bool("evaluate_on_test", False) trainer_type = params.get("trainer", {}).get("type", "default") if trainer_type == "default": # Special logic to instantiate backward-compatible trainer. params['dataset_reader'][ 'debate_mode'] = debate_mode # If debate_mode requires sample duplicates pieces = TrainerPieces.from_params(params, serialization_dir, cuda_device, recover, judge_filename=judge_filename, update_judge=update_judge, eval_mode=eval_mode, reward_method=reward_method, detach_value_head=detach_value_head, allocation_dict=allocation_dict, qa_loss_weight=qa_loss_weight, influence_reward=influence_reward, theory_of_mind=theory_of_mind) # pylint: disable=no-member trainer = Trainer.from_params( model=pieces.model, serialization_dir=serialization_dir, debate_mode=debate_mode, iterator=pieces.iterator, train_data=pieces.train_dataset, validation_data=pieces.validation_dataset, params=pieces.params, validation_iterator=pieces.validation_iterator, eval_mode=eval_mode, breakpoint_level=breakpoint_level, search_outputs_path=search_outputs_path, accumulation_steps=accumulation_steps, allocation_dict=allocation_dict, choice_mode=choice_mode, num_pred_rounds=num_pred_rounds, x_order_prob=x_order_prob, require_action=require_action, single_shot=single_shot) evaluation_iterator = pieces.validation_iterator or pieces.iterator evaluation_dataset = pieces.test_dataset else: assert (len(debate_mode) == 1) and (debate_mode[0] == 'f'), 'TrainerBase untested for debate training.' trainer = TrainerBase.from_params(params, serialization_dir, recover) evaluation_iterator = evaluation_dataset = None params.assert_empty('base train command') try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. 
if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)) and not eval_mode: logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Evaluate if evaluation_dataset and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( trainer.model, evaluation_dataset, evaluation_iterator, cuda_device=trainer._cuda_devices[0], # pylint: disable=protected-access, batch_weight_key="") for key, value in test_metrics.items(): metrics["test_" + key] = value elif evaluation_dataset: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") # Now tar up results if not eval_mode: archive_model(serialization_dir, files_to_archive=params.files_to_archive) dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) else: dump_metrics(os.path.join( serialization_dir, "metrics.eval.d=" + '-'.join(debate_mode) + ".json"), metrics, log=True) # We count on the trainer to have the model with best weights return trainer.model
def train_model(params: Params, serialization_dir: str, selector: str, num_ensemble_models: Optional[int], file_friendly_logging: bool = False, recover: bool = False, force: bool = False) -> Model: """ Trains the model specified in the given :class:`Params` object, using the data and training parameters also specified in that object, and saves the results in ``serialization_dir``. Parameters ---------- params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results and logs. file_friendly_logging : ``bool``, optional (default=False) If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. recover : ``bool``, optional (default=False) If ``True``, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see the ``fine-tune`` command. Returns ------- best_model: ``Model`` The model with the best epoch weights. """ prepare_environment(params) create_serialization_dir(params, serialization_dir, recover, force) prepare_global_logging(serialization_dir, file_friendly_logging) cuda_device = params.params.get('trainer').get('cuda_device', -1) if isinstance(cuda_device, list): for device in cuda_device: check_for_gpu(device) else: check_for_gpu(cuda_device) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation) ) model_params = params.pop('model') if selector == 'qbc': assert num_ensemble_models is not None models_list = [Model.from_params(vocab=vocab, params=model_params.duplicate()) for i in range(num_ensemble_models)] ensemble_model = CorefEnsemble(models_list) model = ensemble_model.submodels[0] else: model = Model.from_params(vocab=vocab, params=model_params) ensemble_model = None # Initializing the model can have side effect of expanding the vocabulary vocab.save_to_files(os.path.join(serialization_dir, "vocabulary")) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) validation_iterator_params = params.pop("validation_iterator", None) if validation_iterator_params: validation_iterator = DataIterator.from_params(validation_iterator_params) validation_iterator.index_with(vocab) else: validation_iterator = None held_out_iterator_params = params.pop("held_out_iterator", None) if held_out_iterator_params: held_out_iterator = DataIterator.from_params(held_out_iterator_params) held_out_iterator.index_with(vocab) else: held_out_iterator = None train_data = all_datasets['train'] held_out_train_data = all_datasets.get('held_out_train') validation_data = all_datasets.get('validation') test_data = all_datasets.get('test') trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if 
any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) frozen_parameter_names, tunable_parameter_names = \ get_frozen_and_tunable_parameter_names(model) logger.info("Following parameters are Frozen (without gradient):") for name in frozen_parameter_names: logger.info(name) logger.info("Following parameters are Tunable (with gradient):") for name in tunable_parameter_names: logger.info(name) trainer_choice = trainer_params.pop("type") trainer = ALCorefTrainer.by_name(trainer_choice).from_params(model=model, serialization_dir=serialization_dir, iterator=iterator, train_data=train_data, held_out_train_data=held_out_train_data, validation_data=validation_data, params=trainer_params, validation_iterator=validation_iterator, held_out_iterator=held_out_iterator, ensemble_model=ensemble_model) evaluate_on_test = params.pop_bool("evaluate_on_test", False) params.assert_empty('base train command') try: metrics, query_info = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info("Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) best_model = None logger.info("Loading the best epoch weights.") best_model_state_path = os.path.join(serialization_dir, 'best.th') best_model_state = torch.load(best_model_state_path) best_model = model best_model.load_state_dict(best_model_state) if test_data and evaluate_on_test: logger.info("The model will be evaluated using the best epoch weights.") test_metrics = evaluate( best_model, test_data, validation_iterator or iterator, cuda_device=trainer._cuda_devices[0], batch_weight_key="", ) for key, value in test_metrics.items(): metrics["test_" + key] = value return best_model, metrics, query_info
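# Usage sketch (illustrative only): despite the ``-> Model`` annotation in the signature
# above, this variant returns a (best_model, metrics, query_info) tuple, so callers unpack
# three values. The path and arguments below are placeholders.
#
#     best_model, metrics, query_info = train_model(
#         params,
#         serialization_dir="runs/al_coref",
#         selector="qbc",
#         num_ensemble_models=3,
#     )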
from torchfly.modules.losses import SequenceFocalLoss, SequenceCrossEntropyLoss

parser = argparse.ArgumentParser()
parser.add_argument("-a", "--alpha", type=float, help="SIA alpha")
parser.add_argument("-b", "--beta", type=float, help="SIA beta")
parser.add_argument("-o", "--output_dir", help="output directory", default="results")
args = parser.parse_args()

# put it at the start of the file
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    level=logging.INFO)
logger = logging.getLogger(__name__)

create_serialization_dir(Params({"seed": 123}),
                         args.output_dir,
                         recover=False,
                         force=True)
stdout_handler = prepare_global_logging(serialization_dir=args.output_dir,
                                        file_friendly_logging=False)
checkpointer = Checkpointer(args.output_dir,
                            keep_serialized_model_every_num_seconds=3600)

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")


class SIAHeadlineDataset(Dataset):
    def __init__(self, data):
        super().__init__()
        self.dataset = data
        self.CLS = [101]
        self.SEP = [102]
def train_model( params: Params, serialization_dir: str, file_friendly_logging: bool = False, recover: bool = False, force: bool = False, cache_directory: str = None, cache_prefix: str = None, ) -> Model: create_serialization_dir(params, serialization_dir, recover, force) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) stdout_handler = prepare_global_logging(serialization_dir, file_friendly_logging) prepare_environment(params) cuda_device = params.params.get("trainer").get("cuda_device", -1) check_for_gpu(cuda_device) evaluate_on_test = params.pop_bool("evaluate_on_test", False) trainer_type = params.get("trainer", {}).get("type", "default") if True: # Special logic to instantiate backward-compatible trainer. pieces = TrainerPieces.from_params(params, serialization_dir, recover, cache_directory, cache_prefix) logger.info("Using MultiTrainer") from lm.trainining.MultiTaskTrainer import MultiTaskTrainer # MultiTrainer trainer = MultiTaskTrainer.from_params( model=pieces.model, serialization_dir=serialization_dir, iterator=pieces.iterator, train_data=pieces.train_dataset, validation_data=pieces.validation_dataset, params=pieces.params, validation_iterator=pieces.validation_iterator, ) evaluation_iterator = pieces.validation_iterator or pieces.iterator evaluation_dataset = pieces.test_dataset else: if evaluate_on_test: raise ValueError( "--evaluate-on-test only works with the default Trainer. " "If you're using the CallbackTrainer you can use a callback " "to evaluate at Events.TRAINING_END; otherwise you'll have " "to run allennlp evaluate separately.") """ The only main difference """ print("Using MultuTrainer") logger.info("Using MultiTrainer") trainer = MultiTrainer.from_params(params, serialization_dir, recover, cache_directory, cache_prefix) evaluation_dataset = None params.assert_empty("base train command") try: metrics = trainer.train() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir, files_to_archive=params.files_to_archive) raise # Evaluate if evaluation_dataset and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights.") test_metrics = evaluate( trainer.model, evaluation_dataset, evaluation_iterator, cuda_device=trainer._cuda_devices[0], # TODO(brendanr): Pass in an arg following Joel's trainer refactor. batch_weight_key="", ) for key, value in test_metrics.items(): metrics["test_" + key] = value elif evaluation_dataset: logger.info( "To evaluate on the test set after training, pass the " "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.") cleanup_global_logging(stdout_handler) # Now tar up results archive_model(serialization_dir, files_to_archive=params.files_to_archive) dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) # We count on the trainer to have the model with best weights return trainer.model
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." .format(args.output_dir)) # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) # Set seed set_seed(args) params = Params.from_file(args.config_file, args.overrides) cuda_device = params.params.get('trainer').get('cuda_device', -1) create_serialization_dir(params, args.output_dir, recover=False, force=True) params.to_file(os.path.join(args.output_dir, CONFIG_NAME)) teacher_path = params.pop("teacher_path", None) # support multi dataset training. all_datasets = training_util.datasets_from_params(params) # student model initialized from a pretrained QA model if args.model_archive is not None: student_model = load_archive(args.model_archive).model else: datasets_for_vocab_creation = set( params.pop("datasets_for_vocab_creation", all_datasets)) if args.recover and os.path.exists( os.path.join(args.output_dir, "vocabulary")):
def main(args):
    params = Params.from_file(args.params_file)
    # print('Data seed:{}, Percent data: {}'.format(shuffle_id, train_size))
    settings.cuda = params['cuda_device'] != -1
    common_util.prepare_environment(params)
    serialization_dir = params['serialization_dir']
    training_util.create_serialization_dir(params, serialization_dir,
                                           args.recover, args.force)
    common_util.prepare_global_logging(serialization_dir, True)
    logging.info(
        "torch version: {}, allennlp version: {}, allennlp path: {}".format(
            torch.__version__, allennlp.__version__, allennlp.__path__))
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    semi_supervision = params.get('semi_supervised', False)
    which_mixer = params.get('which_mixer', 'cm')
    dd_warmup_iters = params.pop('dd_warmup_iters', 1)
    dd_semi_warmup_iters = params.pop('dd_semi_warmup_iters', 1)
    dd_update_freq = params.pop('dd_update_freq', 2)
    constraints_wt = params.get('constraints_wt', 0)
    calc_valid_freq = params.get('calc_valid_freq', 1)
    backprop_after_xbatches = params.pop('backprop_after_xbatches', 1)
    min_pct_of_unlabelled = params.pop('min_pct_of_unlabelled', 0.0)
    dd_increase_freq_after = params.pop('dd_increase_freq_after', 0)
    dd_increase_freq_by = params.pop('dd_increase_freq_by', 0)
    dd_decay_lr = params.pop('dd_decay_lr', 0)
    dd_decay_lr_after = params.pop('dd_decay_lr_after', 1.0)
    grad_norm_before_warmup = params.pop('grad_norm_before_warmup', 0)

    if semi_supervision:
        print("Semi Supervision On")

    for key in [
            'warmup_epochs', 'unlabelled_train_data_file', 'test_data_file',
            'data_dir', 'cuda_device', 'serialization_dir', 'train_data_file',
            'validation_data_file', 'constraints_wt', 'train_size',
            'shuffle_id', 'semi_supervised', 'which_mixer',
            'distributed_lambda_update', 'calc_valid_freq'
    ]:
        params.pop(key, None)

    print("Trainer pieces")
    pieces = gan_trainer.TrainerPiecesForSemi.from_params(
        params, serialization_dir, args.recover, semi_supervision)  # pylint: disable=no-member

    # pieces for constrained learning
    print("Constraint model")
    constraints_model = Model.from_params(vocab=pieces.model.vocab,
                                          params=params.pop('dd_constraints'))
    dd_params = [[n, p] for n, p in constraints_model.named_parameters()
                 if p.requires_grad]
    dd_optimizer = None
    dd_optim_params = params.pop('dd_optimizer', None)
    if len(dd_params) > 0:
        dd_optimizer = Optimizer.from_params(dd_params, dd_optim_params)

    cp = None
    chfile = None
    # Pdb().set_trace()
    if args.weight_dir is not None:
        # Pdb().set_trace()
        flag = True
        if args.weight_file is not None:
            logging.info("Loading Model weights from :{}".format(
                os.path.join(args.weight_dir, args.weight_file)))
            model_states = torch.load(
                os.path.join(args.weight_dir, args.weight_file))
            pieces.model.load_state_dict(model_states)
            flag = False
        if args.dd_file is not None:
            logging.info("Loading Constraint Model from :{}".format(
                os.path.join(args.weight_dir, args.dd_file)))
            flag = False
            chfile = os.path.join(args.weight_dir, args.dd_file)
            # cp = torch.load(chfile)
            # constraints_model.load_state_dict(cp['constraints_model'])
            # if 'dd_update_freq' in cp:
            #     dd_update_freq = cp['dd_update_freq']
            #     print("New dd_update_freq:", dd_update_freq)
        if flag:
            raise ValueError(
                "why provide args.weight_dir? "
                "when both weight_file and dd_file are None")

    print("Trainer")
    trainer = Trainer.from_params(
        model=pieces.model,
        serialization_dir=serialization_dir,
        iterator=pieces.iterator,
        train_data=pieces.train_dataset,
        validation_data=pieces.validation_dataset,
        params=pieces.params,
        validation_iterator=pieces.validation_iterator)

    if args.weight_dir is not None and args.training_state_file is not None:
        logging.info("Loading Training state from :{}".format(
            os.path.join(args.weight_dir, args.training_state_file)))
        training_state = torch.load(
            os.path.join(args.weight_dir, args.training_state_file))
        trainer.optimizer.load_state_dict(training_state["optimizer"])

    params.assert_empty('base train command')
    try:
        # if backprop_after_xbatches == 1:
        #     print("Training setup")
        #     semi_trainer = gan_trainer.SemiSupervisedTrainer(
        #         trainer, constraints_model, dd_optimizer,
        #         pieces.validation_iterator, pieces.unlabelled_dataset,
        #         semi_supervision, which_mixer, dd_warmup_iters,
        #         dd_update_freq, constraints_wt, calc_valid_freq)
        #     print("Training start")
        #     metrics = semi_trainer.custom_train()
        # else:
        print("Training setup")
        semi_trainer = gan_trainer.SemiSupervisedTrainer(
            trainer,
            constraints_model,
            dd_optimizer,
            pieces.validation_iterator,
            pieces.unlabelled_dataset,
            semi_supervision,
            which_mixer,
            dd_warmup_iters,
            dd_update_freq,
            constraints_wt,
            calc_valid_freq,
            backprop_after_xbatches,
            min_pct_of_unlabelled,
            dd_semi_warmup_iters,
            dd_increase_freq_after,
            dd_increase_freq_by,
            dd_decay_lr,
            args.debug,
            chfile=chfile,
            shuffle=args.shuffle,
            dd_decay_lr_after=dd_decay_lr_after,
            grad_norm_before_warmup=grad_norm_before_warmup)
        print("Training start")
        # print(yatin)
        metrics = semi_trainer.custom_train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights.")
            archive_model(serialization_dir,
                          files_to_archive=params.files_to_archive)
        raise

    archive_model(serialization_dir,
                  files_to_archive=params.files_to_archive)
    common_util.dump_metrics(os.path.join(serialization_dir, "metrics.json"),
                             metrics,
                             log=True)
def main(args: argparse.Namespace): for package_name in args.include_package: import_module_and_submodules(package_name) params = Params.from_file(args.param_path, args.overrides) random_seed, numpy_seed, pytorch_seed = 41, 11, 302 if not args.fix: random_seed, numpy_seed, pytorch_seed = random.randint( 0, 999999999), random.randint(0, 999999999), random.randint( 0, 999999999) params["random_seed"] = random_seed params["numpy_seed"] = numpy_seed params["pytorch_seed"] = pytorch_seed prepare_environment(params) serialization_dir = args.serialization_dir create_serialization_dir(params, serialization_dir, args.recover, args.force) prepare_global_logging(serialization_dir, args.file_friendly_logging) hyperparams = list( get_hyperparams(params.as_dict(infer_type_and_cast=True))) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) test_file = params.params.get("test_data_path", None) validation_data_path = params.get("validation_data_path", None) evaluate_on_test = params.pop_bool("evaluate_on_test", False) test_command = None if evaluate_on_test: test_command = BaseEvaluationCommand.from_params( params.pop("test_command")) cuda_device = params.params.get('trainer').get('cuda_device', -1) check_for_gpu(cuda_device) train_model = TrainPipelineModel.from_params( params=params, serialization_dir=serialization_dir, local_rank=0) trainer = train_model.trainer if trainer.validation_command is not None: trainer.validation_command.maybe_set_gold_file(validation_data_path) params.assert_empty('base train command') if args.comet is not None: experiment = Experiment(api_key=args.comet, workspace=args.workspace, project_name=args.project, parse_args=False, auto_output_logging=None) if args.tags: experiment.add_tags(args.tags) with open(args.param_path) as fil: code = "".join(fil.readlines()) code += "\n\n#=============Full details=============\n\n" full_details = _jsonnet.evaluate_file(args.param_path) code += full_details code += "\n\n#=============IMPORTANT: overwritten options============\n\n" code += args.overrides experiment.set_code(code, overwrite=True) for key, val in hyperparams: experiment.log_parameter(key, val) experiment.log_parameter("model_directory", serialization_dir) experiment.log_parameter("cuda_device", cuda_device) experiment.log_parameter("hostname", socket.gethostname()) experiment.log_parameter("random_seed", random_seed) experiment.log_parameter("numpy_seed", numpy_seed) experiment.log_parameter("pytorch_seed", pytorch_seed) else: experiment = None try: metrics = trainer.train(experiment) except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir) raise # Evaluate if test_file and evaluate_on_test: logger.info( "The model will be evaluated using the best epoch weights (see pred_test.txt)." 
) trainer.annotator.annotate_file( trainer.model, test_file, os.path.join(serialization_dir, "pred_test.txt")) if test_command: logger.info("Comparing against gold standard.") test_command.maybe_set_gold_file(test_file) test_metrics = test_command.evaluate( os.path.join(serialization_dir, "pred_test.txt")) if experiment: with experiment.test(): experiment.log_metrics({ k: v for k, v in test_metrics.items() if np.isscalar(v) }) metrics = merge_dicts(metrics, "test", test_metrics) dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True) if not args.no_archive: # Now tar up results archive_model(serialization_dir)