def __init__(
    self,
    vocab: Vocabulary,
    generator: Params,
    encoder: Params,
    samples: int,
    lambda_init: float,
    desired_length: float,
    rationale_extractor: Model = None,
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
):
    """
    The Lagrangian relaxation code has been mostly copied from the code
    accompanying the original paper.
    """
    super(KumaraswamyGenEncClassifier, self).__init__(vocab, initializer, regularizer)
    self._vocabulary = vocab
    self._num_labels = self._vocabulary.get_vocab_size("labels")
    self._generator = Model.from_params(
        vocab=vocab,
        regularizer=regularizer,
        initializer=initializer,
        params=Params(generator),
    )
    self._encoder = Model.from_params(
        vocab=vocab,
        regularizer=regularizer,
        initializer=initializer,
        params=Params(encoder),
    )
    self._samples = samples
    self._desired_length = min(1.0, max(0.0, desired_length))
    self._rationale_extractor = rationale_extractor
    self._loss_tracks = {
        k: Average()
        for k in ["_lasso_loss", "_base_loss", "_rat_length", "_lambda0", "_c0_ma", "_c0"]
    }
    s_min = torch.Tensor([-0.1])
    s_max = torch.Tensor([1.1])
    self.support = [s_min, s_max]
    self.lagrange_alpha = 0.9
    self.lagrange_lr = 0.01
    self.lambda_min = 1e-12
    self.lambda_max = 5.0
    self.register_buffer("lambda0", torch.full((1,), lambda_init))
    self.register_buffer("c0_ma", torch.full((1,), 0.0))  # moving average
    initializer(self)
def __init__(
    self,
    vocab: Vocabulary,
    generator: Params,
    encoder: Params,
    samples: int,
    reg_loss_lambda: float,
    desired_length: float,
    reg_loss_mu: float,
    rationale_extractor: Model = None,
    supervise_rationale: bool = False,
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
):
    super(BernoulliGenEncClassifier, self).__init__(vocab, initializer, regularizer)
    self._vocabulary = vocab
    self._num_labels = self._vocabulary.get_vocab_size("labels")
    self._generator = Model.from_params(
        vocab=vocab,
        regularizer=regularizer,
        initializer=initializer,
        params=Params(generator),
        supervise_rationale=supervise_rationale,
        max_length_ratio=desired_length,
    )
    self._encoder = Model.from_params(
        vocab=vocab,
        regularizer=regularizer,
        initializer=initializer,
        params=Params(encoder),
    )
    self._samples = samples
    self._reg_loss_lambda = reg_loss_lambda
    self._reg_loss_mu = reg_loss_mu
    self._desired_length = min(1.0, max(0.0, desired_length))
    self._rationale_extractor = rationale_extractor
    self._loss_tracks = {
        k: Average()
        for k in [
            "_lasso_loss",
            "_base_loss",
            "_rat_length",
            "_fused_lasso_loss",
            "_censored_lasso_loss",
            "_generator_loss",
        ]
    }
    self._supervise_rationale = supervise_rationale
    initializer(self)
def from_params(cls, params: Params, vocab: Vocabulary, **extras) -> 'QaSrlParser':
    span_detector_params = params.pop("span_detector")
    question_predictor_params = params.pop("question_predictor")
    span_detector = Model.from_params(vocab=vocab, params=span_detector_params)
    question_predictor = Model.from_params(vocab=vocab, params=question_predictor_params)
    return QaSrlParser(vocab,
                       span_detector=span_detector,
                       question_predictor=question_predictor)
def __init__(
    self,
    vocab: Vocabulary,
    generator: Params,
    encoder: Params,
    samples: int,
    reg_loss_lambda: float,
    desired_length: float,
    reg_loss_mu: float = 2,
    rationale_extractor: Model = None,
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
):
    super(EncoderGeneratorModel, self).__init__(vocab, initializer, regularizer)
    self._vocabulary = vocab
    self._num_labels = self._vocabulary.get_vocab_size("labels")
    self._generator = Model.from_params(
        vocab=vocab, regularizer=regularizer, initializer=initializer, params=Params(generator)
    )
    self._encoder = Model.from_params(
        vocab=vocab, regularizer=regularizer, initializer=initializer, params=Params(encoder)
    )
    self._samples = samples
    self._reg_loss_lambda = reg_loss_lambda
    self._reg_loss_mu = reg_loss_mu
    self._desired_length = min(1.0, max(0.0, desired_length))
    self._rationale_extractor = rationale_extractor
    self._loss_tracks = {
        k: Average()
        for k in [
            "_lasso_loss",
            "_base_loss",
            "_rat_length",
            "_fused_lasso_loss",
            "_average_span_length",
        ]
    }
    s_min = torch.Tensor([-0.1])
    s_max = torch.Tensor([1.1])
    self.support = [s_min, s_max]
    # self.lagrange_alpha = 0.5
    # self.lagrange_lr = 0.01
    # self.register_buffer('lambda0', torch.full((1,), reg_loss_lambda))
    # self.register_buffer('sparsity_ma', torch.full((1,), 0.))  # moving average
    initializer(self)
def __init__(
    self,
    vocab: Vocabulary,
    generator: Params,
    encoder: Params,
    samples: int,
    reg_loss_lambda: float,
    desired_length: float,
    reg_loss_mu: float,
    rationale_extractor: Model = None,
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
):
    super(KumaraswamyGenEncClassifier, self).__init__(vocab, initializer, regularizer)
    self._vocabulary = vocab
    self._num_labels = self._vocabulary.get_vocab_size("labels")
    self._generator = Model.from_params(
        vocab=vocab,
        regularizer=regularizer,
        initializer=initializer,
        params=Params(generator),
    )
    self._encoder = Model.from_params(
        vocab=vocab,
        regularizer=regularizer,
        initializer=initializer,
        params=Params(encoder),
    )
    self._samples = samples
    self._reg_loss_lambda = reg_loss_lambda
    self._reg_loss_mu = reg_loss_mu
    self._desired_length = min(1.0, max(0.0, desired_length))
    self._rationale_extractor = rationale_extractor
    self._loss_tracks = {
        k: Average()
        for k in [
            "_lasso_loss",
            "_base_loss",
            "_rat_length",
            "_fused_lasso_loss",
            "_censored_lasso_loss",
            "_generator_loss",
        ]
    }
    s_min = torch.Tensor([-0.1])
    s_max = torch.Tensor([1.1])
    self.support = [s_min, s_max]
    initializer(self)
def load_teacher_model(teacher_path=None, device=-1):
    models = {}
    if teacher_path is not None:
        # teacher_path is something like "Models/HotpotQA,Models/SQuAD"
        for tea_path in teacher_path.split(","):
            tea_name = tea_path.split("/")[-1]
            config = Params.from_file(os.path.join(tea_path, CONFIG_NAME))
            vocab_tea = Vocabulary.from_files(os.path.join(tea_path, "vocabulary"))
            model = Model.from_params(vocab=vocab_tea, params=config.get("model"))
            tea_model = copy.deepcopy(model)
            model_state = torch.load(
                os.path.join(tea_path, "best.th"),
                map_location=nn_util.device_mapping(device),
            )
            tea_model.load_state_dict(model_state)
            logger.info("Loaded teacher model from %s", tea_path)
            # Freeze the parameters of the teacher model.
            for p in tea_model.parameters():
                p.requires_grad = False
            if device >= 0:
                tea_model.to(device=device)
            models[tea_name] = tea_model
    return models
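A hypothetical invocation of `load_teacher_model` above, assuming checkpoint directories laid out as in the comment ("Models/HotpotQA,Models/SQuAD"); the paths and GPU index are placeholders:

# Placeholder paths; each directory must contain config.json, vocabulary/, and best.th.
teachers = load_teacher_model(teacher_path="Models/HotpotQA,Models/SQuAD", device=0)
hotpot_teacher = teachers["HotpotQA"]  # dict is keyed by the last path component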
def _load(cls,
          config: Params,
          serialization_dir: str,
          weights_file: str = None,
          cuda_device: int = -1) -> 'Model':
    """
    Ensembles don't have vocabularies or weights of their own, so they override _load.
    """
    model_params = config.get('model')

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=None, params=model_params)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model
def train(model_path, train_path, val_path, seed, vocabulary_path=None, config_path=None):
    assert os.path.isdir(model_path), "Model directory does not exist"
    set_seed(seed)

    config_path = config_path or os.path.join(model_path, "config.json")
    assert os.path.isfile(config_path), "Config file does not exist"
    params = Params.from_file(config_path)

    vocabulary_path = vocabulary_path or os.path.join(model_path, "vocabulary")
    assert os.path.exists(vocabulary_path), "Vocabulary is not ready, do not forget to run preprocess.py first"
    vocabulary = Vocabulary.from_files(vocabulary_path)

    reader_params = params.duplicate().pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    train_dataset = reader.read(train_path)
    val_dataset = reader.read(val_path) if val_path else None

    model_params = params.pop("model")
    model = Model.from_params(model_params, vocab=vocabulary)
    print(model)
    print("Trainable params count: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    iterator = DataIterator.from_params(params.pop('iterator'))
    iterator.index_with(vocabulary)
    trainer = Trainer.from_params(model, model_path, iterator,
                                  train_dataset, val_dataset, params.pop('trainer'))
    trainer.train()
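A minimal usage sketch for `train` above; it assumes the model directory already contains config.json and a vocabulary/ folder built by preprocess.py. All paths and the seed are hypothetical.

train(
    model_path="model_dir",         # hypothetical directory with config.json and vocabulary/
    train_path="data/train.jsonl",  # hypothetical training file
    val_path="data/val.jsonl",      # hypothetical validation file; pass None to skip validation
    seed=42,
)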
def from_params(params: Params,
                iterator,
                val_iterator,
                vocab,
                all_datasets,
                serialization_dir: str,
                recover: bool = False) -> 'TrainerPieces':
    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())

    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    return TrainerPieces(model, iterator, train_data, validation_data, test_data,
                         val_iterator, trainer_params)
def _load(
    cls,
    config: Params,
    serialization_dir: str,
    weights_file: Optional[str] = None,
    cuda_device: int = -1,
    opt_level: Optional[str] = None,
) -> Model:
    """
    Ensembles don't have vocabularies or weights of their own, so they override _load.
    """
    if opt_level is not None:
        raise NotImplementedError(f"{cls.__name__} does not support AMP yet.")

    model_params = config.get("model")

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_weights_related_keys_from_params(model_params)
    model = Model.from_params(vocab=None, params=model_params)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model
def from_params(params: Params,
                serialization_dir: str,
                recover: bool = False) -> 'TrainerPieces':
    all_datasets = training_util.datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")):
        vocab = Vocabulary.from_files(os.path.join(serialization_dir, "vocabulary"))
        params.pop("vocabulary", {})
    else:
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
        )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(model.vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())

    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    return TrainerPieces(model, iterator, train_data, validation_data, test_data,
                         validation_iterator, trainer_params)
def __init__(
    self,
    vocab: Vocabulary,
    generator: Params,
    encoder: Params,
    samples: int,
    reg_loss_lambda: float,
    desired_length: float,
    reg_loss_mu: float = 2,
    rationale_extractor: Model = None,
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
):
    super(EncoderGeneratorModel, self).__init__(vocab, initializer, regularizer)
    self._vocabulary = vocab
    self._num_labels = self._vocabulary.get_vocab_size("labels")
    self._generator = Model.from_params(
        vocab=vocab, regularizer=regularizer, initializer=initializer, params=Params(generator)
    )
    self._encoder = Model.from_params(
        vocab=vocab, regularizer=regularizer, initializer=initializer, params=Params(encoder)
    )
    self._samples = samples
    self._reg_loss_lambda = reg_loss_lambda
    self._reg_loss_mu = reg_loss_mu
    self._desired_length = min(1.0, max(0.0, desired_length))
    self._rationale_extractor = rationale_extractor
    self._loss_tracks = {
        k: Average()
        for k in [
            "_lasso_loss",
            "_base_loss",
            "_rat_length",
            "_fused_lasso_loss",
            "_average_span_length",
        ]
    }
    initializer(self)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'GraphAndTextModel':
    text_model = params.pop("text_model", None)
    text_model = Model.from_params(vocab, text_model)
    graph_model = params.pop("graph_model")
    graph_model = Model.from_params(vocab, graph_model)
    classify_feed_forward = FeedForward.from_params(params.pop('classify_feed_forward'))
    initializer = InitializerApplicator.from_params(params.pop('initializer', []))
    regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))
    params.assert_empty(cls.__name__)
    return cls(vocab=vocab,
               classify_feed_forward=classify_feed_forward,
               text_model=text_model,
               graph_model=graph_model,
               initializer=initializer,
               regularizer=regularizer)
def openie_model(serialization_dir, weights_file=None, cuda_device=-1):
    """
    Instantiates an already-trained model, based on the experiment configuration
    and some optional overrides.
    """
    # Load config
    config = Params.from_file(os.path.join(serialization_dir, CONFIG_NAME), "")
    config.loading_from_archive = True
    config = config.duplicate()
    weights_file = weights_file or os.path.join(serialization_dir, _WEIGHTS_NAME)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, "vocabulary")
    # If the config specifies a vocabulary subclass, we need to use it.
    vocab_params = config.get("vocabulary", Params({}))
    vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
    vocab = Vocabulary.by_name(vocab_choice).from_files(vocab_dir)

    model_params = config.get("model")

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)

    # If vocab+embedding extension was done, the model initialized from from_params
    # and one defined by state dict in weights_file might not have same embedding shapes.
    # Eg. when model embedder module was transferred along with vocab extension, the
    # initialized embedding weight shape would be smaller than one in the state_dict.
    # So calling model embedding extension is required before load_state_dict.
    # If vocab and model embeddings are in sync, following would be just a no-op.
    model.extend_embedder_vocab()

    model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(model_state)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model, config
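A short usage sketch for `openie_model`, assuming a hypothetical serialization directory produced by an AllenNLP training run:

model, config = openie_model("runs/openie", cuda_device=-1)  # "runs/openie" is a placeholder
model.eval()  # switch to inference mode before prediction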
def __init__(
    self,
    vocab: Vocabulary,
    generator: Params,
    encoder: Params,
    reg_loss_lambda: float,
    reg_loss_mu: float = 2,
    reinforce_loss_weight: float = 1.0,
    rationale_supervision_loss_weight: float = 1.0,
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
):
    super(EncoderGeneratorModel, self).__init__(vocab, initializer, regularizer)
    self._vocabulary = vocab
    self._num_labels = self._vocabulary.get_vocab_size("labels")
    self._generator = Model.from_params(
        vocab=vocab, regularizer=regularizer, initializer=initializer, params=Params(generator)
    )
    self._encoder = Model.from_params(
        vocab=vocab, regularizer=regularizer, initializer=initializer, params=Params(encoder)
    )
    self._reg_loss_lambda = reg_loss_lambda
    self._reg_loss_mu = reg_loss_mu
    self._reinforce_loss_weight = reinforce_loss_weight
    self._rationale_supervision_loss_weight = rationale_supervision_loss_weight
    self._loss_tracks = {
        k: Average() for k in ["lasso_loss", "base_loss", "fused_lasso_loss"]
    }
    initializer(self)
def from_params(cls, params: Params, vocab: Vocabulary) -> 'CMVMultiChannelPredictor':
    response_only_predictor = Model.from_params(
        params=params.pop('response_only_predictor'), vocab=vocab)
    op_response_predictor = Model.from_params(
        params=params.pop('op_response_predictor'), vocab=vocab)
    output_feedforward = FeedForward.from_params(params=params.pop('output_feedforward'))
    dropout = params.pop("dropout", 0)
    initializer = InitializerApplicator.from_params(params=params.pop('initializer', []))
    regularizer = RegularizerApplicator.from_params(params=params.pop('regularizer', []))
    params.assert_empty(cls.__name__)
    return cls(vocab, response_only_predictor, op_response_predictor,
               output_feedforward, dropout, initializer, regularizer)
def _get_predictor(args: argparse.Namespace) -> Predictor:
    check_for_gpu(args.cuda_device)
    params = Params.from_file(args.extractor_config_file)
    model = Model.from_params(vocab=None, params=params.pop('model'))
    if args.cuda_device >= 0:
        model.to(args.cuda_device)
    else:
        model.to(None)  # .to(None) is a no-op, so the model stays on the CPU
    archive = Archive(model=model, config=params)
    return Predictor.from_archive(
        archive, args.predictor, dataset_reader_to_load=args.dataset_reader_choice)
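`_get_predictor` above reads several attributes off the parsed command-line arguments; a minimal sketch of the namespace it expects, with hypothetical values:

args = argparse.Namespace(
    cuda_device=-1,                                     # run on CPU
    extractor_config_file="configs/extractor.jsonnet",  # hypothetical config path
    predictor="extractor-predictor",                    # hypothetical registered predictor name
    dataset_reader_choice="train",                      # which reader section to load from the config
)
predictor = _get_predictor(args)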
def _test_model(self, file_name):
    params = self.params[file_name].duplicate()
    reader_params = params.duplicate().pop("reader", default=Params({}))
    if reader_params["type"] == "cnn_dailymail":
        reader_params["cnn_tokenized_dir"] = TEST_STORIES_DIR
        dataset_file = TEST_URLS_FILE
    elif reader_params["type"] == "ria":
        dataset_file = RIA_EXAMPLE_FILE
    else:
        assert False

    reader = DatasetReader.from_params(reader_params)
    tokenizer = reader._tokenizer
    dataset = reader.read(dataset_file)

    vocabulary_params = params.pop("vocabulary", default=Params({}))
    vocabulary = Vocabulary.from_params(vocabulary_params, instances=dataset)

    model_params = params.pop("model")
    model = Model.from_params(model_params, vocab=vocabulary)
    print(model)
    print("Trainable params count: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    iterator = DataIterator.from_params(params.pop('iterator'))
    iterator.index_with(vocabulary)
    trainer = Trainer.from_params(model, None, iterator, dataset, None, params.pop('trainer'))
    trainer.train()

    model.eval()
    predictor = Seq2SeqPredictor(model, reader)
    for article, reference_sents in reader.parse_set(dataset_file):
        ref_words = [token.text for token in tokenizer.tokenize(reference_sents)]
        decoded_words = predictor.predict(article)["predicted_tokens"]
        self.assertGreaterEqual(len(decoded_words), len(ref_words))
        unk_count = 0
        while DEFAULT_OOV_TOKEN in decoded_words:
            unk_index = decoded_words.index(DEFAULT_OOV_TOKEN)
            decoded_words.pop(unk_index)
            unk_count += 1
            if unk_index < len(ref_words):
                ref_words.pop(unk_index)
        self.assertLess(unk_count, 5)
        self.assertListEqual(decoded_words[:len(ref_words)], ref_words)
def _get_predictor(args: argparse.Namespace) -> Predictor:
    check_for_gpu(args.cuda_device)
    params = Params.from_file(args.scorer_config_file)
    archive = load_archive(
        args.archive_file,
        weights_file=args.weights_file,
        cuda_device=args.cuda_device,
        overrides=args.overrides,
    )
    model = Model.from_params(vocab=None, model=archive.model, params=params)
    model.to(args.cuda_device)
    archive = Archive(model=model, config=archive.config)
    return Predictor.from_archive(
        archive, args.predictor, dataset_reader_to_load=args.dataset_reader_choice)
def instantiate_model_from_config(config_file_path: str,
                                  cuda_device: int = -1,
                                  overrides: str = None,
                                  include_package: str = "models") -> Model:
    logging.disable(logging.INFO)
    import_module_and_submodules(include_package)
    params = Params.from_file(config_file_path, overrides)
    vocab_dir = params.pop("vocabulary").pop("directory_path")
    vocab = Vocabulary.from_files(vocab_dir)
    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()
    return model
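A hedged usage sketch for `instantiate_model_from_config`; it assumes the config declares `vocabulary.directory_path` and that custom components are registered in a local `models` package. The config path is hypothetical.

model = instantiate_model_from_config("configs/experiment.jsonnet", cuda_device=-1)
model.eval()  # the function only instantiates the model; no weights are loaded here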
def load_model(model_name="conll_full_elmo"):
    """
    Load both vocabulary and model and create an instance of the full HMTL model.
    """
    if model_name not in ["conll_small_elmo", "conll_medium_elmo", "conll_full_elmo"]:
        raise ValueError(f"{model_name} is not a valid name of model.")

    serialization_dir = "model_dumps" + "/" + model_name
    params = Params.from_file(params_file=os.path.join(serialization_dir, "config.json"))

    # Load TokenIndexer
    task_keys = [key for key in params.keys() if re.search("^task_", key)]
    token_indexer_params = params.pop(task_keys[-1]).pop("data_params").pop(
        "dataset_reader").pop("token_indexers")
    # see https://github.com/allenai/allennlp/issues/181 for better syntax
    token_indexers = {}
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)

    # Load the vocabulary
    logger.info("Loading vocabulary from %s", os.path.join(serialization_dir, "vocabulary"))
    vocab = Vocabulary.from_files(os.path.join(serialization_dir, "vocabulary"))
    logger.info("Vocabulary loaded")

    # Create model and load weights
    model_params = params.pop("model")
    model = Model.from_params(vocab=vocab, params=model_params, regularizer=None)
    model_state_path = os.path.join(serialization_dir, "weights.th")
    model_state = torch.load(model_state_path, map_location="cpu")
    model.load_state_dict(state_dict=model_state)

    return model, vocab, token_indexers
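A brief usage sketch for `load_model`, assuming the pretrained HMTL dumps are present under model_dumps/ as the function requires:

model, vocab, token_indexers = load_model("conll_full_elmo")
model.eval()  # weights were loaded with map_location="cpu"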
def from_params(cls,  # type: ignore
                params: Params,
                serialization_dir: str,
                recover: bool = False) -> "MultiTaskTrainer":
    readers = {
        name: DatasetReader.from_params(reader_params)
        for name, reader_params in params.pop("train_dataset_readers").items()
    }
    train_file_paths = params.pop("train_file_paths").as_dict()

    datasets = {
        name: reader.read(train_file_paths[name])
        for name, reader in readers.items()
    }

    instances = (instance for dataset in datasets.values() for instance in dataset)
    vocab = Vocabulary.from_params(Params({}), instances)
    model = Model.from_params(params.pop("model"), vocab=vocab)
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    mingler = DatasetMingler.from_params(params.pop("mingler"))

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    num_epochs = params.pop_int("num_epochs", 10)
    _ = params.pop("trainer", Params({}))
    params.assert_empty(__name__)

    return MultiTaskTrainer(model, serialization_dir, iterator, mingler,
                            optimizer, datasets, num_epochs)
def modified_model_load(config: Params,
                        serialization_dir: str,
                        weights_file: str = None,
                        cuda_device: int = -1) -> Model:
    """
    Instantiates an already-trained model, based on the experiment configuration
    and some optional overrides.
    """
    weights_file = weights_file or os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, 'vocabulary')
    # If the config specifies a vocabulary subclass, we need to use it.
    vocab = Vocabulary.from_files(vocab_dir)

    model_params = config.get('model')

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from. We're now _loading_ the model, so those embeddings will already be
    # stored in our weights. We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)
    model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(model_state, strict=False)

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model
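A hypothetical call to `modified_model_load` above; what distinguishes it from the stock loader is the non-strict `load_state_dict`, which tolerates missing or extra keys. The directory and config filename are placeholders.

config = Params.from_file(os.path.join("runs/exp1", "config.json"))  # hypothetical paths
model = modified_model_load(config, serialization_dir="runs/exp1", cuda_device=-1)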
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see the ``fine-tune`` command.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    check_for_gpu(params.params.get('trainer').get('cuda_device', -1))

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())

    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params,
                                  validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model, test_data, validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return best_model
def from_params(cls,
                params: Params,
                serialization_dir: str,
                recover: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> 'PtDistTrainer':
    all_datasets = training_util.datasets_from_params(params, cache_directory, cache_prefix)
    vocab = Vocabulary.from_files(params.vocabulary.directory_path)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    model.extend_embedder_vocab()
    if is_master_rank():
        vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')

    batch_size = params.iterator.batch_size

    trainer_params = params.pop("trainer")
    # Discard everything left in `params` and work with the trainer sub-config only.
    keys = [key for key in params]
    for key in keys:
        params.pop(key)
    params = trainer_params

    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    pretrain_file = params.pop("pretrain_file", None)

    no_grad_regexes = params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    model = model.cuda(dist.get_rank())
    if pretrain_file:
        model_state = torch.load(pretrain_file,
                                 map_location=nn_util.device_mapping(dist.get_rank()))
        model.load_state_dict(model_state)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    # print([n for n, p in model.named_parameters() if p.requires_grad])
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None

    num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
    checkpointer = Checkpointer(
        serialization_dir=serialization_dir,
        num_serialized_models_to_keep=num_serialized_models_to_keep,
        keep_serialized_model_every_num_seconds=None)

    return cls(model, optimizer, train_data, validation_data,
               batch_size=batch_size,
               validation_metric=validation_metric,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               checkpointer=checkpointer)
}})
params = Params.from_file(model_config, overrides)
model_file = 'checkpoint/%s%s/' % (model_name, attn)
iterator = DataIterator.from_params(params.pop("iterator"))
torch.manual_seed(0)
numpy.random.seed(0)
if write_file:
    wf = Write_outfile(Wfile_name)
print("Loading vocabulary")
vocab = Vocabulary.from_files(model_file + 'vocabulary')
print('Initializing model')
model = Model.from_params(vocab=vocab, params=params.pop('model'))
print("Loading model file from %s" % (model_file + 'best.th'))
with open(model_file + 'best.th', 'rb') as f:
    model.load_state_dict(torch.load(f, encoding='utf-8'))
iterator.index_with(vocab)
dataset_reader_params = params.pop('dataset_reader')
datareader = DatasetReader.from_params(dataset_reader_params)
model.eval()
# Read the data files
for file in files:
    dom = xml.dom.minidom.parse(file)
    root = dom.documentElement
    OrgQ_list = root.getElementsByTagName('OrgQuestion')
    q1_last = None
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see the ``fine-tune`` command.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover)

    # TODO(mattg): pull this block out into a separate function (maybe just add this to
    # `prepare_environment`?)
    Tqdm.set_slower_interval(file_friendly_logging)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),  # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),  # type: ignore
                           sys.stderr,
                           file_friendly_logging)
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    metrics = trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def train_model(params: Params,
                serialization_dir: str,
                cuda_device: int,
                train_data_path: str,
                validation_data_path: str,
                test_data_path: str,
                file_friendly_logging: bool = False) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP directly from a JSON
    specification using a :class:`Driver`. Note that if you care about reproducibility, you should
    avoid running any code that uses Pytorch or numpy before you import and use this function;
    these libraries rely on random seeds, which can be set in this function via a JSON
    specification file. Note that this function performs training and will also evaluate the
    trained model on development and test sets if provided in the parameter json.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),  # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),  # type: ignore
                           sys.stderr,
                           file_friendly_logging)
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # all_datasets = datasets_from_params(params)
    all_datasets = datasets_from_args(params, train_data_path, validation_data_path,
                                      test_data_path)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    if cuda_device >= 0:
        model = model.cuda(cuda_device)

    # iterator = DataIterator.from_params(params.pop("iterator"))
    # iterator.index_with(vocab)
    train_iterator = DataIterator.from_params(params.pop("train_iterator"))
    val_iterator = DataIterator.from_params(params.pop("val_iterator"))
    train_iterator.index_with(vocab)
    val_iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, train_iterator, val_iterator,
                                  cuda_device, train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    # params.assert_empty('base train command')

    metrics = trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, val_iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during the middle
        of a run. For continuing training a model on new data, see the ``fine-tune`` command.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.
    """
    prepare_environment(params)
    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())

    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
                                                          validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info("Training interrupted by the user. Attempting to create "
                         "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model, test_data, validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0]  # pylint: disable=protected-access
        )
        for key, value in test_metrics.items():
            metrics["test_" + key] = value
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
vocab = Vocabulary.from_files(os.path.join(args.serialization_dir, "vocabulary"))
logger.info("Vocabulary loaded")

### Load the data iterators ###
task_list = create_and_set_iterators(params=params, task_list=task_list, vocab=vocab)

### Regularization ###
regularizer = None

### Create model ###
model_params = params.pop("model")
model = Model.from_params(vocab=vocab, params=model_params, regularizer=regularizer)

### Real evaluation ###
cuda_device = params.pop("multi_task_trainer").pop_int("cuda_device", -1)

metrics = {task._name: {} for task in task_list}
for task in task_list:
    if not task._evaluate_on_test:
        continue

    logger.info("Task %s will be evaluated using the best epoch weights.", task._name)
    assert (
        task._test_data is not None
    ), "Task {} wants to be evaluated on the test dataset but there is no test data loaded.".format(
def train_model(params: Params, serialization_dir: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP directly from a JSON
    specification using a :class:`Driver`. Note that if you care about reproducibility, you should
    avoid running any code that uses Pytorch or numpy before you import and use this function;
    these libraries rely on random seeds, which can be set in this function via a JSON
    specification file. Note that this function performs training and will also evaluate the
    trained model on development and test sets if provided in the parameter json.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """
    prepare_environment(params)

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr)  # type: ignore
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets: List[Dataset] = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = dataset_reader.read(test_data_path)
        all_datasets.append(test_data)
        datasets_in_vocab.append("test")
    else:
        test_data = None

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        Dataset([instance for dataset in all_datasets for instance in dataset.instances]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop("evaluate_on_test", False)
    params.assert_empty('base train command')
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    if test_data and evaluate_on_test:
        test_data.index_instances(vocab)
        evaluate(model, test_data, iterator, cuda_device=trainer._cuda_device)  # pylint: disable=protected-access
    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    return model