def _save_vocab_to_disk(self, vocab: Vocabulary) -> str:
    """Writes the vocab into a fresh temporary directory to reuse it between the trials

    The `TemporaryDirectory` handle is appended to `self._created_tmp_dirs`,
    so it stays referenced after this method returns.

    Parameters
    ----------
    vocab
        Vocabulary to be saved to disk

    Returns
    -------
    vocab_path
        Path to the vocabulary, that is a directory

    Raises
    ------
    ValidationError
        If the vocabulary that was just written cannot be loaded back
    """
    scratch_dir = tempfile.TemporaryDirectory()
    self._created_tmp_dirs.append(scratch_dir)

    vocab_path = scratch_dir.name
    vocab.save_to_files(vocab_path)

    # Round-trip check: the files we just wrote must be loadable.
    try:
        Vocabulary.from_files(vocab_path)
    except Exception as load_error:
        message = f"Could not load vocab saved in '{vocab_path}'"
        raise ValidationError(message) from load_error

    return vocab_path
def predict_from_ensemble(trigger_prediction_dirs, trigger_vocab_dirs,
                          arg_prediction_dir, arg_vocab_dir, test_file,
                          output_file):
    """Combine several trigger predictors with one argument predictor.

    Loads per-model trigger predictions, the trigger vocabularies, the
    argument predictions/vocabulary and the gold data, then writes one
    `predict_one` result per document to `output_file`, one per line.

    Parameters
    ----------
    trigger_prediction_dirs
        Mapping whose values are passed to `get_pred_dicts`.
    trigger_vocab_dirs
        Iterable of vocabulary directories, one `Vocabulary` is loaded per entry.
    arg_prediction_dir
        Passed to `get_pred_dicts` for the argument predictions.
    arg_vocab_dir
        Directory of the argument vocabulary.
    test_file
        Passed to `get_gold_data`.
    output_file
        Path of the text file to (over)write.
    """
    trig_pred_dict = {}
    for model_key, prediction_dir in trigger_prediction_dirs.items():
        trig_pred_dict[model_key] = get_pred_dicts(prediction_dir)
    trigger_vocabs = [
        Vocabulary.from_files(vocab_dir) for vocab_dir in trigger_vocab_dirs
    ]

    arg_preds = get_pred_dicts(arg_prediction_dir)
    arg_vocab = Vocabulary.from_files(arg_vocab_dir)
    gold = get_gold_data(test_file)

    # Every trigger model must cover exactly the documents of the arg model.
    arg_docs = set(arg_preds.keys())
    assert all(
        arg_docs == set(per_model.keys())
        for per_model in trig_pred_dict.values()
    )

    trig_preds = unwrap(trig_pred_dict)
    with open(output_file, "w") as out:
        for doc_key in trig_preds:
            line = predict_one(trig_preds[doc_key], arg_preds[doc_key],
                               gold[doc_key], trigger_vocabs, arg_vocab)
            out.write(line + "\n")
def from_config(
    cls,
    config: Union[PipelineConfiguration, dict],
    vocab_path: Optional[str] = None,
) -> "Pipeline":
    """Builds a pipeline out of a `PipelineConfiguration` or a plain configuration dict

    Parameters
    ----------
    config: `Union[PipelineConfiguration, dict]`
        A `PipelineConfiguration` object or a configuration dict
    vocab_path: `Optional[str]`
        If provided, the pipeline vocabulary will be loaded from this path

    Returns
    -------
    pipeline: `Pipeline`
        A configured pipeline

    Raises
    ------
    TypeError
        If the object built from the parameters is not a `PipelineModel`
    """
    if isinstance(config, dict):
        config = PipelineConfiguration.from_dict(config)

    model_params = Params({"config": config})
    if vocab_path is None:
        vocab = None
    else:
        vocab = Vocabulary.from_files(vocab_path)

    model = PipelineModel.from_params(model_params, vocab=vocab)
    if not isinstance(model, PipelineModel):
        raise TypeError(f"Cannot load model. Wrong format of {model}")

    cls._add_transformers_vocab_if_needed(model)

    return cls(model, config)
def _load(cls,
          config: Params,
          serialization_dir: str,
          weights_file: str = None,
          cuda_device: int = -1) -> 'Model':
    """
    Rebuilds a trained model from its experiment configuration, the vocabulary
    stored under ``serialization_dir`` and a weights file (``_DEFAULT_WEIGHTS``
    inside ``serialization_dir`` unless ``weights_file`` is given).
    """
    if not weights_file:
        weights_file = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # The vocabulary lives in a fixed sub-directory of the serialization dir.
    vocab = Vocabulary.from_files(os.path.join(serialization_dir, 'vocabulary'))

    # The config describes how to _train_ the model, pretrained embedding files
    # included. When _loading_, those embeddings are already part of the saved
    # weights, so the corresponding parameters are removed to keep the code
    # from looking for the files.
    model_params = config.get('model')
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)

    state = torch.load(weights_file,
                       map_location=util.device_mapping(cuda_device))
    model.load_state_dict(state)

    # Put the whole model on the requested device so embeddings and weights
    # stay in sync.
    if cuda_device < 0:
        model.cpu()
    else:
        model.cuda(cuda_device)

    return model
def main(cl_arguments):
    '''Run inference (REPL or corpus mode) for a CoLA model.

    Args:
        cl_arguments: command-line arguments, forwarded to `handle_arguments`.

    Raises:
        AssertionError: if the configured target task is not "cola", or if an
            input/output path is given in "repl" mode.
        KeyError: if the inference mode is neither "repl" nor "corpus".
    '''
    # --- Arguments handling ---
    cl_args = handle_arguments(cl_arguments)
    args = config.params_from_file(cl_args.config_file, cl_args.overrides)
    check_arg_name(args)
    assert args.target_tasks == "cola", \
        "Currently only supporting CoLA. ({})".format(args.target_tasks)

    if args.cuda >= 0:
        # Best effort: any failure while selecting the GPU falls back to CPU.
        try:
            if not torch.cuda.is_available():
                raise EnvironmentError(
                    "CUDA is not available, or not detected by PyTorch.")
            log.info("Using GPU %d", args.cuda)
            torch.cuda.set_device(args.cuda)
        except Exception:
            log.warning("GPU access failed. You might be using a CPU-only"
                        " installation of PyTorch. Falling back to CPU.")
            args.cuda = -1

    # --- Prepare data ---
    _, target_tasks, vocab, word_embs = build_tasks(args)
    tasks = sorted(set(target_tasks), key=lambda t: t.name)

    # --- Build model and load its saved state ---
    model = build_model(args, vocab, word_embs, tasks)
    log.info("Loading existing model from %s...", cl_args.model_file_path)
    load_model_state(model, cl_args.model_file_path, args.cuda, [],
                     strict=False)

    # --- Inference setup ---
    model.eval()
    vocab = Vocabulary.from_files(os.path.join(args.exp_dir, 'vocab'))
    indexers = build_indexers(args)
    task = take_one(tasks)

    # --- Run inference ---
    mode = cl_args.inference_mode
    if mode == "corpus":
        run_corpus_inference(
            model, vocab, indexers, task, args,
            cl_args.input_path,
            cl_args.input_format,
            cl_args.output_path,
            cl_args.eval_output_path,
        )
        return
    if mode != "repl":
        raise KeyError(mode)

    assert cl_args.input_path is None
    assert cl_args.output_path is None
    print("Running REPL for task: {}".format(task.name))
    run_repl(model, vocab, indexers, task, args)
def create_save_vocab(file_path, target_dir, word_min_count, char_min_count):
    """Build a word/char vocabulary from a SQuAD file and save it to `target_dir`.

    Prints the vocabulary sizes, then the char index-to-token mapping twice:
    once from the in-memory vocabulary and once after reloading it from
    `target_dir`.

    Args:
        file_path: path handed to `SquadReader.read`.
        target_dir: directory the vocabulary is written to and reloaded from.
        word_min_count: minimum count for the "word2idx" namespace.
        char_min_count: minimum count for the "char2idx" namespace.
    """
    word_ns, char_ns = "word2idx", "char2idx"

    reader = SquadReader(token_indexers={
        "tokens": SingleIdTokenIndexer(namespace=word_ns),
        "chars": TokenCharactersIndexer(namespace=char_ns),
    })
    instances = reader.read(file_path)
    vocab = Vocabulary.from_instances(
        instances,
        min_count={word_ns: word_min_count, char_ns: char_min_count})

    n_words = vocab.get_vocab_size(word_ns)
    n_chars = vocab.get_vocab_size(char_ns)
    vocab.save_to_files(target_dir)
    print("save word2idx={}, char2idx={} to {}".format(n_words, n_chars,
                                                       target_dir))

    # The word mapping is fetched but not printed, as in the original flow.
    idx2word = vocab.get_index_to_token_vocabulary(word_ns)
    idx2char = vocab.get_index_to_token_vocabulary(char_ns)
    print(idx2char)

    # Reload from disk and print again so both outputs can be compared.
    reloaded = Vocabulary.from_files(target_dir)
    print(reloaded.get_index_to_token_vocabulary(char_ns))
    return
def get_data_loader(config):
    """Return the vocabulary plus train/validation iterators described by `config`.

    Tokenized instances are cached as pickles. If the train cache file exists,
    the train and validation caches are loaded; otherwise the raw data is read
    with `NMTDataReader` and train/valid/test instances are written to their
    cache paths. The vocabulary is rebuilt from the train instances when the
    data was just tokenized or when `vocab_path` does not exist, and loaded
    from `vocab_path` otherwise.

    Keys popped from `config`: "train_instances_path", "valid_instances_path",
    "test_instances_path", "vocab_path", "train_batch_size",
    "valid_batch_size"; when tokenizing also "train_data_path",
    "valid_data_path", "test_data_path"; when building the vocabulary also
    "max_vocab_size" and "max_characters".

    Returns:
        A tuple `(vocab, train_iterator, valid_iterator)`; both iterators are
        `DataIteratorWrapper` objects.
    """
    train_instances_path = Path(config.pop("train_instances_path"))
    valid_instances_path = Path(config.pop("valid_instances_path"))
    test_instances_path = Path(config.pop("test_instances_path"))

    build_vocab_from_scratch = False
    if train_instances_path.exists():
        info("Loading tokenized instances")
        instances = []
        for path in (train_instances_path, valid_instances_path):
            with path.open("rb") as f:
                # NOTE(security): pickle.load runs arbitrary code from the
                # file; only load caches written by this function.
                instances.append(pickle.load(f))
    else:
        info("Tokenizing instances...")
        data_reader = NMTDataReader()
        build_vocab_from_scratch = True
        train_instances = data_reader.read(config.pop("train_data_path"))
        valid_instances = data_reader.read(config.pop("valid_data_path"))
        test_instances = data_reader.read(config.pop("test_data_path"))
        instances = [train_instances, valid_instances]

        info("Saving instances to disk")
        caches = zip(
            [train_instances, valid_instances, test_instances],
            [train_instances_path, valid_instances_path, test_instances_path],
        )
        for inst, path in caches:
            # exist_ok avoids FileExistsError when the cache directory is
            # already there; each target gets its own parent created.
            path.parent.mkdir(parents=True, exist_ok=True)
            with path.open("wb") as f:
                pickle.dump(inst, f, protocol=pickle.HIGHEST_PROTOCOL)

    vocab_path = Path(config.pop("vocab_path"))
    if build_vocab_from_scratch or not vocab_path.exists():
        max_vocab_size = config.pop("max_vocab_size")
        max_characters = config.pop("max_characters")
        vocab = Vocabulary.from_instances(
            instances[0],
            max_vocab_size={
                "char_src": max_characters,
                "token_src": max_vocab_size,
                "char_trg": max_characters,
                "token_trg": max_vocab_size,
            },
        )
        vocab.save_to_files(vocab_path)
    else:
        vocab = Vocabulary.from_files(vocab_path)

    train_instances, valid_instances = instances
    return (
        vocab,
        DataIteratorWrapper(
            vocab,
            train_instances,
            shuffle=True,
            batch_size=config.pop("train_batch_size"),
        ),
        DataIteratorWrapper(
            vocab,
            valid_instances,
            shuffle=False,
            batch_size=config.pop("valid_batch_size"),
        ),
    )
def __init__(
    self,
    config: Config,
    models: Dict[str, Type[nn.Module]],
    gpu_ids: List[int] = [0],
    cpu_workers: int = 0,
):
    r"""Set up vocabulary, validation dataloader and model handle for evaluation.

    Parameters
    ----------
    config: Config
        Experiment configuration; its ``PHASE`` must be ``"program_prior"``.
    models: Dict[str, Type[nn.Module]]
        Models forwarded to the parent constructor; must contain the key
        ``"program_prior"``.
    gpu_ids: List[int], optional (default=[0])
        Forwarded unchanged to the parent constructor.
    cpu_workers: int, optional (default=0)
        Number of worker processes used by the validation ``DataLoader``.

    Raises
    ------
    ValueError
        If ``config.PHASE`` is not ``"program_prior"``.
    """
    self._C = config

    if self._C.PHASE != "program_prior":
        raise ValueError(
            f"Trying to initialize a ProgramPriorEvaluator, expected config PHASE to be "
            f"program_prior, found {self._C.PHASE}")

    # Initialize vocabulary, dataloader and model.
    self._vocabulary = Vocabulary.from_files(self._C.DATA.VOCABULARY)

    dataset = ProgramPriorDataset(self._C.DATA.VAL_TOKENS)
    # `cpu_workers` used to be accepted but silently ignored; pass it to the
    # loader. The default of 0 keeps data loading in the main process.
    dataloader = DataLoader(dataset,
                            batch_size=self._C.OPTIM.BATCH_SIZE,
                            num_workers=cpu_workers)

    super().__init__(config=config,
                     dataloader=dataloader,
                     models=models,
                     gpu_ids=gpu_ids)

    # This will be a part of `self._models`, keep this handle for convenience.
    self._program_prior = self._models["program_prior"]
def __init__(
    self,
    config: Config,
    models: Dict[str, Type[nn.Module]],
    gpu_ids: List[int] = [0],
    cpu_workers: int = 0,
):
    r"""Prepare vocabulary, validation dataloader and model handles for evaluation.

    Parameters
    ----------
    config: Config
        Experiment configuration; its ``PHASE`` must be ``"question_coding"``.
    models: Dict[str, Type[nn.Module]]
        Models forwarded to the parent constructor; must contain the keys
        ``"program_generator"`` and ``"question_reconstructor"``.
    gpu_ids: List[int], optional (default=[0])
        Forwarded unchanged to the parent constructor.
    cpu_workers: int, optional (default=0)
        Number of worker processes used by the validation ``DataLoader``.

    Raises
    ------
    ValueError
        If ``config.PHASE`` is not ``"question_coding"``.
    """
    self._C = config

    phase = self._C.PHASE
    if phase != "question_coding":
        raise ValueError(
            f"Trying to initialize a QuestionCodingEvaluator, expected config PHASE to be "
            f"question_coding, found {self._C.PHASE}")

    # Vocabulary first, then the validation data pipeline.
    self._vocabulary = Vocabulary.from_files(self._C.DATA.VOCABULARY)

    # Evaluation has no notion of "supervision", so only tokens are needed.
    val_dataset = QuestionCodingDataset(self._C.DATA.VAL_TOKENS)
    val_loader = DataLoader(
        val_dataset,
        batch_size=self._C.OPTIM.BATCH_SIZE,
        num_workers=cpu_workers,
    )

    super().__init__(
        config=config,
        dataloader=val_loader,
        models=models,
        gpu_ids=gpu_ids,
    )

    # Both models also live in `self._models`; these are convenience handles.
    self._program_generator = self._models["program_generator"]
    self._question_reconstructor = self._models["question_reconstructor"]
def _load_elmo(serialization_dir, vocabulary_dir, weights_file=None,
               cuda_device=-1):
    """Load a trained `BiDAFSelfAttention` model together with its config.

    Args:
        serialization_dir: directory containing `config.json`.
        vocabulary_dir: directory containing the `vocabulary` sub-directory
            and, by default, the `best.th` weights file.
        weights_file: optional path to a weights file. When omitted,
            `<vocabulary_dir>/best.th` is used. (Previously this argument was
            overwritten unconditionally, so an explicit value had no effect.)
        cuda_device: GPU id to move the model to, or a negative value for CPU.

    Returns:
        A tuple `(model, config)`.
    """
    config = Params.from_file(os.path.join(serialization_dir, 'config.json'),
                              "")
    config.loading_from_archive = True

    # Honour an explicit override; fall back to the default location.
    weights_file = weights_file or os.path.join(vocabulary_dir, 'best.th')

    vocab_dir = os.path.join(vocabulary_dir, 'vocabulary')
    vocab = Vocabulary.from_files(vocab_dir)

    model_params = config.get('model')
    # Drop pretrained-embedding parameters from the config before building
    # the model that the saved state is loaded into.
    remove_pretrained_embedding_params(model_params)
    model = BiDAFSelfAttention.from_params(vocab=vocab, params=model_params)

    model_state = torch.load(weights_file,
                             map_location=device_mapping(cuda_device))
    model.load_state_dict(model_state)

    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model, config
def get_trainer_from_config(
        config: Params,
        train_instances: List[Instance],
        val_instances: List[Instance],
        device: int,
        serialization_dir: Optional[str] = None) -> Trainer:
    """Assemble a `Trainer` from a configuration and pre-read instances.

    Pops "trainer", "model", "vocab_dir" (optional) and "iterator" from
    `config`. The vocabulary is loaded from "vocab_dir" when that key is
    present and not None, otherwise it is built from `train_instances`.
    The trainer parameters are forced to use `device` and to keep one
    serialized model.
    """
    trainer_params = config.pop("trainer")
    trainer_params["cuda_device"] = device
    model_params = config.pop("model")

    vocab_dir = config.pop("vocab_dir", None)
    if vocab_dir is not None:
        vocab = Vocabulary.from_files(vocab_dir)
    else:
        vocab = Vocabulary.from_instances(train_instances)

    model = Model.from_params(model_params, vocab=vocab)

    iterator = DataIterator.from_params(config.pop("iterator"))
    trainer_params["num_serialized_models_to_keep"] = 1
    iterator.index_with(vocab)

    return Trainer.from_params(
        model=model,
        iterator=iterator,
        train_data=train_instances,
        validation_data=val_instances,
        serialization_dir=serialization_dir,
        params=trainer_params,
    )
def _load(cls, config, serialization_dir, weights_file=None, cuda_device=-1):
    u"""
    Recreates a trained model: builds it from the ``model`` section of the
    experiment configuration and the vocabulary saved in ``serialization_dir``,
    then loads the weights (``_DEFAULT_WEIGHTS`` in ``serialization_dir``
    unless ``weights_file`` is given) and moves it to the requested device.
    """
    if not weights_file:
        weights_file = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    # Vocabulary is read from its fixed sub-directory.
    vocab = Vocabulary.from_files(
        os.path.join(serialization_dir, u'vocabulary'))

    model_params = config.get(u'model')

    # The config says how to _train_ the model, including where pretrained
    # embeddings come from. On load they are already inside the stored
    # weights, so the pretrained-file parameters are stripped to keep the
    # code from looking for those files.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)

    device_map = util.device_mapping(cuda_device)
    model.load_state_dict(torch.load(weights_file, map_location=device_map))

    # Force the model onto cpu or gpu so embeddings match the weights.
    if cuda_device < 0:
        model.cpu()
    else:
        model.cuda(cuda_device)

    return model
def __init__(self, default_seeds: Iterable = None, quiet: bool = False):
    """Fetch the required resources and build the parser and language model.

    Downloads the language-model files and the ActivityNet captions, loads
    the constituency-parser predictor, restores the `SimpleBiLM` weights on
    CPU, and finally sets `self.default_seeds`.

    Parameters
    ----------
    default_seeds : Iterable, optional
        Seeds to use. When None, they are read from `train.json` of the
        downloaded ActivityNet captions.
    quiet : bool
        Forwarded to the parent constructor.
    """
    super().__init__(default_seeds, quiet)

    # Resource acquisition (same order as always: LM, captions, parser).
    lm_files = download_files(
        fnames=['vocabulary.zip', 'lm-fold-0.bin'],
        local_folder='swag_lm')
    activity_data_files = download_files(
        fnames=['captions.zip'],
        paths='https://cs.stanford.edu/people/ranjaykrishna/densevid/',
        local_folder='activitynet_captions')
    const_parser_files = cached_path(
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz',
        cache_dir=str(DATA_ROOT / 'allennlp_constituency_parser'))

    parser_model = PretrainedModel(const_parser_files, 'constituency-parser')
    self.const_parser = parser_model.predictor()

    vocab_source, weights_source = lm_files[0], lm_files[1]
    self.language_model = SimpleBiLM(
        vocab=Vocabulary.from_files(vocab_source),
        recurrent_dropout_probability=0.2,
        embedding_dropout_probability=0.2)
    checkpoint = torch.load(weights_source, map_location='cpu')
    optimistic_restore(self.language_model, checkpoint['state_dict'])

    if default_seeds is not None:
        self.default_seeds = default_seeds
    else:
        self.default_seeds = ActivityNetCaptionsDatasetReader().read(
            activity_data_files[0] + '/train.json')
def load(cls,
         config: Params,
         serialization_prefix: str = None,
         weights_file: str = None,
         cuda_device: int = -1) -> 'Model':
    """
    Builds an already-trained model from its experiment configuration,
    optionally overriding where the serialized files are found.

    Parameters
    ----------
    config: Params
        The configuration that was used to train the model. It should
        definitely have a `model` section, and should probably have a
        `trainer` section as well.
    serialization_prefix: str = None
        Path to the serialized model. When not given, the value of
        `config['trainer']['serialization_prefix']` is used.
    weights_file: str = None
        Weights file to load. When not given, `_DEFAULT_WEIGHTS` inside the
        serialization directory is used.
    cuda_device: int = -1
        GPU id to load the model on; a negative value keeps it on the CPU.

    Returns
    -------
    model: Model
        The model specified in the configuration, loaded with the serialized
        vocabulary and the trained weights.

    Raises
    ------
    ConfigurationError
        If no serialization prefix is given or configured.
    """
    trainer_config = config.get("trainer", {})
    if not serialization_prefix:
        serialization_prefix = trainer_config.get('serialization_prefix')
    if serialization_prefix is None:
        raise ConfigurationError('serialization_prefix must be specified')

    if not weights_file:
        weights_file = os.path.join(serialization_prefix, _DEFAULT_WEIGHTS)

    # The vocabulary sits next to the weights, in its own sub-directory.
    vocab = Vocabulary.from_files(
        os.path.join(serialization_prefix, 'vocabulary'))

    model = Model.from_params(vocab, config.get('model'))
    state = torch.load(weights_file,
                       map_location=device_mapping(cuda_device))
    model.load_state_dict(state)

    # Move everything to the target device so the embeddings are in sync
    # with the weights.
    if cuda_device < 0:
        model.cpu()
    else:
        model.cuda(cuda_device)

    return model
def load(cls,
         config: Params,
         serialization_dir: str,
         weights_file: str = None,
         cuda_device: int = -1) -> 'Model':
    """
    Re-creates a trained model from the experiment configuration and the
    files stored in a serialization directory.

    Parameters
    ----------
    config: Params
        The configuration that was used to train the model. Its `model`
        section is read here.
    serialization_dir: str
        The directory containing the serialized weights and the
        `vocabulary` sub-directory of the model.
    weights_file: str = None
        Weights file to load; defaults to `_DEFAULT_WEIGHTS` inside
        `serialization_dir`.
    cuda_device: int = -1
        GPU id to load the model on; a negative value keeps it on the CPU.

    Returns
    -------
    model: Model
        The model specified in the configuration, loaded with the serialized
        vocabulary and the trained weights.
    """
    if not weights_file:
        weights_file = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    vocab = Vocabulary.from_files(
        os.path.join(serialization_dir, 'vocabulary'))

    # Training configs may point at pretrained embedding files. Those
    # embeddings are already contained in the saved weights, so the
    # references are removed before the model is built.
    model_params = config.get('model')
    _remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab, model_params)

    location = device_mapping(cuda_device)
    model.load_state_dict(torch.load(weights_file, map_location=location))

    # Keep embeddings and weights on the same device.
    if cuda_device < 0:
        model.cpu()
    else:
        model.cuda(cuda_device)

    return model
def build_vocab(instances: Iterable[Instance]) -> Vocabulary:
    """Return the vocabulary stored under `PRETRAIN_MODEL/vocab`, creating it if absent.

    When the path does not exist yet, the vocabulary is built from
    `instances` and saved there before being returned.
    """
    print('Building the vocabulary')
    vocab_path = os.path.join(PRETRAIN_MODEL, 'vocab')

    if not os.path.exists(vocab_path):
        fresh_vocab = Vocabulary.from_instances(instances)
        fresh_vocab.save_to_files(vocab_path)
        return fresh_vocab

    return Vocabulary.from_files(vocab_path)
def _load_vocab(archive_path: Path) -> Vocabulary:
    """Load the vocabulary stored in the `vocabulary` directory of a model archive.

    The gzip-compressed tar archive is unpacked into a temporary directory
    that is removed again before returning, including when extraction or
    vocabulary loading raises (the earlier version leaked the directory on
    failure).

    Parameters
    ----------
    archive_path : Path
        Path to the `.tar.gz` archive.

    Returns
    -------
    Vocabulary
        The vocabulary read from `<archive>/vocabulary`.
    """
    # an annoying hack to load the vocab file
    with tempfile.TemporaryDirectory() as tempdir:
        with tarfile.open(archive_path, 'r:gz') as _archive:
            # NOTE(security): extractall() trusts member paths inside the
            # archive; only use this with archives from a trusted source.
            _archive.extractall(tempdir)
        vocab_path = Path(tempdir) / "vocabulary"
        return Vocabulary.from_files(vocab_path)
def load(cls,
         config: Params,
         serialization_dir: str,
         weights_file: str = None,
         cuda_device: int = -1) -> 'Model':
    """
    Restores a trained model using the experiment configuration plus the
    vocabulary and weights found in `serialization_dir`.

    Parameters
    ----------
    config: Params
        The configuration that was used to train the model. Its `model`
        section is read here.
    serialization_dir: str
        The directory containing the serialized weights and the
        `vocabulary` sub-directory of the model.
    weights_file: str = None
        Weights file to load; defaults to `_DEFAULT_WEIGHTS` inside
        `serialization_dir`.
    cuda_device: int = -1
        GPU id to load the model on; a negative value keeps it on the CPU.

    Returns
    -------
    model: Model
        The model specified in the configuration, loaded with the serialized
        vocabulary and the trained weights.
    """
    weights_path = weights_file
    if not weights_path:
        weights_path = os.path.join(serialization_dir, _DEFAULT_WEIGHTS)

    vocabulary_dir = os.path.join(serialization_dir, 'vocabulary')
    vocab = Vocabulary.from_files(vocabulary_dir)

    model_params = config.get('model')
    # The training config may name pretrained embedding files; the saved
    # weights already include those embeddings, so the entries are dropped
    # to avoid looking the files up again.
    _remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab, model_params)

    model_state = torch.load(
        weights_path, map_location=util.device_mapping(cuda_device))
    model.load_state_dict(model_state)

    # Place the complete model on one device so the embeddings are in sync
    # with the weights.
    on_gpu = cuda_device >= 0
    if on_gpu:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model
def _load_files(self):
    """Read config, vocabulary, vocab reader and model state from `self.tempdir`.

    Populates `self.config`, `self.vocabulary`, `self.vocab_reader` and
    `self.model_state`. The `map_location` callable returns the storage it
    is given unchanged.
    """
    base_dir = self.tempdir
    vocab_dir = os.path.join(base_dir, _VOCAB_DIR_NAME)

    self.config = Params.from_file(os.path.join(base_dir, _CONFIG_NAME))
    self.vocabulary = Vocabulary.from_files(vocab_dir)
    self.vocab_reader = VocabReader(
        os.path.join(base_dir, _VOCAB_DIR_NAME, _TOKENS_NAME))

    self.model_state = torch.load(
        os.path.join(base_dir, _WEIGHTS_NAME),
        map_location=lambda storage, loc: storage)
def __init__(
        self,
        vocab: Vocabulary,
        # source-side
        bert_encoder: BaseBertWrapper,
        encoder_token_embedder: TextFieldEmbedder,
        encoder_pos_embedding: Embedding,
        encoder: Seq2SeqEncoder,
        syntax_edge_type_namespace: str = None,
        biaffine_parser: DeepTreeParser = None,
        dropout: float = 0.0,
        eps: float = 1e-20,
        pretrained_weights: str = None,
        vocab_dir: str = None,
) -> None:
    """Set up an encoder-side dependency parser.

    All decoder-related arguments of the parent constructor are passed as
    None. After the parent is initialised, pretrained weights are partially
    loaded when `self.pretrained_weights` is set, and the token-to-index
    mapping of the syntax edge-type namespace is replaced by the one stored
    in `vocab_dir` when that directory is given.
    """
    super(UDParser, self).__init__(
        vocab=vocab,
        bert_encoder=bert_encoder,
        encoder_token_embedder=encoder_token_embedder,
        encoder=encoder,
        # no decoder side in this model
        decoder_token_embedder=None,
        decoder_node_index_embedding=None,
        decoder=None,
        extended_pointer_generator=None,
        tree_parser=None,
        label_smoothing=None,
        target_output_namespace=None,
        pretrained_weights=pretrained_weights,
        dropout=dropout,
        eps=eps,
    )

    # source-side
    self.encoder_pos_embedding = encoder_pos_embedding

    # misc
    self._syntax_edge_type_namespace = syntax_edge_type_namespace
    self.biaffine_parser = biaffine_parser
    self.vocab_dir = vocab_dir

    # metrics
    self._syntax_metrics = AttachmentScores()
    self.syntax_las = 0.0
    self.syntax_uas = 0.0

    # compatibility
    self.loss_mixer = None
    self.syntactic_method = "encoder-side"

    # pretrained
    if self.pretrained_weights is not None:
        self.load_partial(self.pretrained_weights)

    # load vocab
    if self.vocab_dir is None:
        return
    edge_namespace = self._syntax_edge_type_namespace
    stored_vocab = Vocabulary.from_files(vocab_dir)
    stored_mapping = stored_vocab._token_to_index[edge_namespace]
    self.vocab._token_to_index[edge_namespace] = stored_mapping
def from_config(cls, config: Config):
    r"""Build an instance from the ``DATA`` and ``NMN`` sections of a
    :class:`~probnmn.config.Config`."""
    nmn = config.NMN
    vocabulary = Vocabulary.from_files(config.DATA.VOCABULARY)
    return cls(  # type: ignore
        vocabulary=vocabulary,
        image_feature_size=tuple(nmn.IMAGE_FEATURE_SIZE),
        module_channels=nmn.MODULE_CHANNELS,
        class_projection_channels=nmn.CLASS_PROJECTION_CHANNELS,
        classifier_linear_size=nmn.CLASSIFIER_LINEAR_SIZE,
    )
def from_config(cls, config: Config):
    r"""Build an instance from the ``DATA`` and ``QUESTION_RECONSTRUCTOR``
    sections of a :class:`~probnmn.config.Config`."""
    vocabulary = Vocabulary.from_files(config.DATA.VOCABULARY)
    section = config.QUESTION_RECONSTRUCTOR
    return cls(
        vocabulary=vocabulary,
        input_size=section.INPUT_SIZE,
        hidden_size=section.HIDDEN_SIZE,
        num_layers=section.NUM_LAYERS,
        dropout=section.DROPOUT,
    )
def train(model_dir):
    """Train a model on CoQA, reusing vocab and weights found in `model_dir`.

    `model_dir` is created when missing. A vocabulary saved at
    `<model_dir>/vocab` is loaded if present; otherwise it is built from the
    train and validation data and saved there. Weights at
    `<model_dir>/model.th` are loaded if present, and the trained weights
    are written back to that file at the end.
    """
    # --- prepare data ---
    reader = CoqaDatasetReader(
        tokenizer=lambda sent: SpacyWordSplitter(
            language='en_core_web_sm').split_words(sent))
    train_dataset = reader.read(
        cached_path('/mnt/DATA/ML/data/corpora/QA/CoQA/coqa-train-v1.0.json'))
    validation_dataset = reader.read(
        cached_path('/mnt/DATA/ML/data/corpora/QA/CoQA/coqa-dev-v1.0.json'))

    model_fn = os.path.join(model_dir, 'model.th')
    vocab_fn = os.path.join(model_dir, 'vocab')

    vocab = None
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    elif os.path.exists(vocab_fn):
        logging.info('load vocab from: %s...', vocab_fn)
        vocab = Vocabulary.from_files(vocab_fn)

    if vocab is None:
        vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
        logging.info('save vocab to: %s...', vocab_fn)
        vocab.save_to_files(vocab_fn)
    logging.info('data prepared')

    # --- prepare model ---
    model = create_model(vocab)
    if os.path.exists(model_fn):
        logging.info('load model wheights from: %s...', model_fn)
        with open(model_fn, 'rb') as weights_in:
            model.load_state_dict(torch.load(weights_in))
    logging.info('model prepared')

    # --- prepare training ---
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    iterator = BasicIterator(batch_size=2)
    iterator.index_with(vocab)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        patience=10,
        num_epochs=100,
    )
    logging.info('training prepared')

    trainer.train()

    logging.info('save model to: %s...', model_fn)
    with open(model_fn, 'wb') as weights_out:
        torch.save(model.state_dict(), weights_out)
def predict_from_pair(trigger_prediction_dir, arg_prediction_dir, vocab_dir,
                      test_file, output_file):
    """Combine one trigger predictor with one argument predictor.

    Loads both prediction sets, the vocabulary and the gold data, checks that
    both prediction sets cover the same documents, and writes one
    `predict_one` result per document to `output_file`, one per line.
    """
    trig_preds = get_pred_dicts(trigger_prediction_dir)
    arg_preds = get_pred_dicts(arg_prediction_dir)
    vocab = Vocabulary.from_files(vocab_dir)
    gold = get_gold_data(test_file)

    # Both predictors must have seen exactly the same documents.
    arg_docs, trig_docs = set(arg_preds.keys()), set(trig_preds.keys())
    assert arg_docs == trig_docs

    with open(output_file, "w") as out:
        for doc_key in trig_preds:
            line = predict_one(trig_preds[doc_key], arg_preds[doc_key],
                               gold[doc_key], vocab)
            out.write(line + "\n")
def _restore_vocab(self, folder: str) -> Vocabulary:
    """Load the vocabulary from `folder` and extend the model with it.

    When the transformers namespace is the only configured feature
    namespace, nothing is read from disk and `self.vocab` is returned as is.

    Parameters
    ----------
    folder
        Directory holding the saved vocabulary

    Returns
    -------
    vocab
        The restored vocabulary, or `self.vocab` in the transformers-only case
    """
    configured = self.config.features.configured_namespaces
    transformers_only = configured == [TransformersFeatures.namespace]
    if transformers_only:
        # The transformers feature brings its own vocab; nothing to restore.
        return self.vocab

    self._check_for_word_vector_weights_file()
    restored = Vocabulary.from_files(folder)
    self._model.extend_vocabulary(restored)

    return restored
def from_run(cls, run_dir: str, task_name: str, split_name: str):
    """Build an instance from the prediction file of a finished run.

    The vocabulary is read from the `vocab` directory next to `run_dir`
    (i.e. in its parent directory); predictions come from
    `<run_dir>/<task_name>_<split_name>.json`. The label namespace passed to
    the constructor is `<task_name>_labels`.
    """
    # Vocabulary lives in the experiment directory, one level above the run.
    experiment_dir = os.path.dirname(run_dir.rstrip("/"))
    vocab_path = os.path.join(experiment_dir, "vocab")
    log.info("Loading vocabulary from %s", vocab_path)
    vocab = Vocabulary.from_files(vocab_path)

    # Predictions of the requested task/split.
    preds_file = os.path.join(run_dir, f"{task_name}_{split_name}.json")
    log.info("Loading predictions from %s", preds_file)
    predictions = utils.load_json_data(preds_file)

    return cls(vocab, predictions, label_namespace=f"{task_name}_labels")
def __init__(self,
             vocab: Vocabulary = None,
             vocab_path: str = None,
             max_enc_len: int = 512,
             max_dec_len: int = 30,
             max_turn_len: int = 3,
             index_name: str = "tokens",
             start_token: str = "[CLS]",
             end_token: str = "[SEP]",
             pad_token: str = "[PAD]",
             oov_token: str = "[UNK]",
             do_lower_case: bool = True,
             token_indexers: Dict[str, TokenIndexer] = None,
             tokenizer: Tokenizer = None,
             never_split: List[str] = None,
             lazy: bool = False,
             **kwargs):
    """Configure the reader: tokenizer, indexers, length limits and vocabulary.

    When `vocab` is None the vocabulary is loaded from `vocab_path` with
    `pad_token` / `oov_token` as padding and OOV tokens. When `tokenizer` is
    None a `ChineseCharacterTokenizer` is created; the four special tokens
    are always placed in front of any user-supplied `never_split` entries.
    Note that `self._max_turn_len` stores `max_turn_len - 1`.
    """
    super(PointerRewriteReader, self).__init__(lazy, **kwargs)

    special_tokens = [start_token, end_token, pad_token, oov_token]
    never_split = special_tokens + (never_split or [])

    # the max length of the input
    self.max_length = max_enc_len

    # tokens and length limits
    self._start_token = start_token
    self._end_token = end_token
    self._max_dec_len = max_dec_len
    self._max_turn_len = max_turn_len - 1
    self._index_name = index_name

    if tokenizer is not None:
        self._tokenizer = tokenizer
    else:
        self._tokenizer = ChineseCharacterTokenizer(do_lower_case,
                                                    never_split)

    self._token_indexers = token_indexers or {
        index_name: SingleIdTokenIndexer(namespace=index_name)
    }

    if vocab is not None:
        self.vocab = vocab
    else:
        self.vocab = Vocabulary.from_files(vocab_path,
                                           padding_token=pad_token,
                                           oov_token=oov_token)

    # cache the token-to-index mapping, the OOV id and the vocabulary size
    self._token2index = self.vocab.get_token_to_index_vocabulary(
        namespace=index_name)
    self._unk_id = self.vocab.get_token_index(self.vocab._oov_token,
                                              namespace=index_name)
    self._vocab_size = self.vocab.get_vocab_size(namespace=index_name)
def predict(self, file_path: str, vocab_path: str):
    """Run the model over a data file and collect the extracted batch data.

    Parameters
    ----------
    file_path
        File handed to `self.data_reader.read`.
    vocab_path
        Directory of the vocabulary used to index the iterator.

    Returns
    -------
    list
        One `self._extract_data(batch)` result per batch, in iteration order.
    """
    dataset = self.data_reader.read(file_path)
    self.iterator.index_with(Vocabulary.from_files(vocab_path))
    self.model.eval()

    batches = self.iterator(dataset, num_epochs=1, shuffle=False)
    progress = tqdm(batches, total=self.iterator.get_num_batches(dataset))

    # No gradients are needed while collecting predictions.
    with torch.no_grad():
        return [
            self._extract_data(util.move_to_device(batch, self.cuda_device))
            for batch in progress
        ]
def setUp(self):
    """Read the dev train/val datasets and load (or build and save) the vocabulary."""
    logging.basicConfig(level=logging.INFO)

    reader = SummDataReader(
        WordTokenizer(JustSpacesWordSplitter()),
        source_max_tokens=400,
        lazy=False,
    )
    self.train_dataset = reader.read('../data/dev_bbc/train.dev.tsv.tagged')
    self.val_dataset = reader.read('../data/dev_bbc/val.dev.tsv.tagged')

    vocab_path = 'data/cnndm/vocab'
    if not os.path.exists(vocab_path):
        # First run: build from the training data and keep it for next time.
        self.vocab = Vocabulary.from_instances(self.train_dataset,
                                               max_vocab_size=80000)
        self.vocab.save_to_files(vocab_path)
    else:
        self.vocab = Vocabulary.from_files(vocab_path)
def get_shared_from_dir_allennlp(vocab_dir):
    '''Read the word and char token-to-index maps from a saved allennlp vocab.

    Args:
        vocab_dir -- directory the allennlp vocab was saved to
    Returns:
        word2idx -- token-to-index mapping of the "word2idx" namespace
        char2idx -- token-to-index mapping of the "char2idx" namespace
    '''
    vocab = Vocabulary.from_files(vocab_dir)
    word2idx, char2idx = (
        vocab.get_token_to_index_vocabulary(namespace)
        for namespace in ("word2idx", "char2idx")
    )
    print("word={}, char={}".format(len(word2idx), len(char2idx)))
    return word2idx, char2idx
def get_parser():
    """Load the IRNet Spider baseline archive on GPU 0 and wrap it in a parser.

    The archive is loaded with overrides that point it at the Spider dataset
    directory, and the model's SQL evaluator is updated with the same path.

    Returns:
        An `IRNetSpiderParser` wrapping the loaded model.
    """
    dataset_path = 'data/datasets/spider'

    # NOTE(review): the loaded vocabulary is not used below; the call is
    # kept so behaviour (including failure on a missing directory) is
    # unchanged.
    Vocabulary.from_files(
        'parsers/irnet/checkpoints/v1.0_spider_baseline_model/vocabulary')

    overrides = json.dumps({
        "dataset_path": dataset_path,
        "train_data_path": "train.json",
        "validation_data_path": "dev.json"
    })
    archive = load_archive(
        'parsers/irnet/checkpoints/v1.0_spider_baseline_model/model.tar.gz',
        cuda_device=0,
        overrides=overrides)
    parser_model = archive.model
    parser_model.sql_metric_util._evaluator.update_dataset_path(
        dataset_path=dataset_path)

    return IRNetSpiderParser(model=parser_model)
def build_ontology_and_vocab(
        ontology_path: str,
        vocab_path: Optional[str] = None) -> Tuple[Vocabulary, Dict]:
    """Load the ontology JSON and the matching vocabulary.

    With a `vocab_path`, the vocabulary is loaded from that directory.
    Without one, a new vocabulary is created: namespace 'span_labels' gets
    'None', '@@PADDING@@' and every key of `ontology['args']`; namespace
    'event_labels' gets every key of `ontology['events']`.

    Returns:
        A tuple `(vocab, ontology)`.
    """
    with open(ontology_path) as ontology_file:
        ontology = json.load(ontology_file)

    if vocab_path is not None:
        return Vocabulary.from_files(vocab_path), ontology

    vocab = Vocabulary()
    for special in ('None', '@@PADDING@@'):
        vocab.add_token_to_namespace(token=special, namespace='span_labels')
    vocab.add_tokens_to_namespace(list(ontology['args'].keys()),
                                  namespace='span_labels')
    vocab.add_tokens_to_namespace(list(ontology['events'].keys()),
                                  namespace='event_labels')
    return vocab, ontology
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False,
                force: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.

    Returns
    -------
    best_model: ``Model``
        The model with the best epoch weights.

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    KeyboardInterrupt
        Re-raised when training is interrupted, after an archive has been
        attempted if the best-weights file already exists.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover, force)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    # ``cuda_device`` may be a single device or a list of devices; check each.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    if isinstance(cuda_device, list):
        for device in cuda_device:
            check_for_gpu(device)
    else:
        check_for_gpu(cuda_device)

    # Written before any ``pop`` below consumes keys from ``params``.
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    all_datasets = datasets_from_params(params)
    # Defaults to every loaded dataset when the key is absent.
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    if recover and os.path.exists(os.path.join(serialization_dir, "vocabulary")):
        # Recovering a run: reuse the vocabulary saved by the earlier attempt.
        vocab = Vocabulary.from_files(os.path.join(serialization_dir, "vocabulary"))
    else:
        vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
        )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    # Only the 'train' split is mandatory; the others may be ``None``.
    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    trainer_choice = trainer_params.pop_choice("type",
                                               Trainer.list_available(),
                                               default_to_first_choice=True)
    trainer = Trainer.by_name(trainer_choice).from_params(model=model,
                                                          serialization_dir=serialization_dir,
                                                          iterator=iterator,
                                                          train_data=train_data,
                                                          validation_data=validation_data,
                                                          params=trainer_params,
                                                          validation_iterator=validation_iterator)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    # Called once every key this function reads has been popped.
    params.assert_empty('base train command')

    try:
        metrics = trainer.train()
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            # Uses the module logger, like every other message in this function.
            logger.info("Training interrupted by the user. Attempting to create "
                        "a model archive using the current best epoch weights.")
            archive_model(serialization_dir, files_to_archive=params.files_to_archive)
        raise

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    logger.info("Loading the best epoch weights.")
    # NOTE(review): presumably the same file as ``_DEFAULT_WEIGHTS`` above - confirm.
    best_model_state_path = os.path.join(serialization_dir, 'best.th')
    best_model_state = torch.load(best_model_state_path)
    # ``best_model`` is the same object as ``model``; the weights are loaded in place.
    best_model = model
    best_model.load_state_dict(best_model_state)

    if test_data and evaluate_on_test:
        logger.info("The model will be evaluated using the best epoch weights.")
        test_metrics = evaluate(
            best_model, test_data, validation_iterator or iterator,
            cuda_device=trainer._cuda_devices[0],  # pylint: disable=protected-access,
            # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
            batch_weight_key=""
        )
        # Test metrics are merged into the training metrics under a "test_" prefix.
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    return best_model
def build_tasks(args):
    '''Prepare tasks.

    Resolves the train and eval task names from ``args``, obtains the task
    objects, and attaches indexed train/val/test data to each of them. The
    data is either restored from the pickled preprocessing file under
    ``args.exp_dir`` (when ``args.load_preproc`` is set and the file exists)
    or computed from scratch and then saved there together with the vocab.

    Args:
        args -- namespace read for: train_tasks, eval_tasks, max_seq_len,
                load_tasks, max_word_v_size, elmo, elmo_no_glove, exp_dir,
                preproc_file, load_preproc, word_embs_file, d_word

    Returns:
        train_tasks -- tasks whose name is among the training task names
        eval_tasks -- tasks whose name is among the evaluation task names
        vocab -- the vocabulary, loaded from disk or freshly built
        word_embs -- word embeddings, from the cache or freshly computed
    '''

    def parse_tasks(task_list):
        '''parse string of tasks'''
        if task_list == 'all':
            tasks = ALL_TASKS
        elif task_list == 'none':
            tasks = []
        else:
            tasks = task_list.split(',')
        return tasks

    train_task_names = parse_tasks(args.train_tasks)
    eval_task_names = parse_tasks(args.eval_tasks)
    # Union of both lists, so each task is obtained and processed only once.
    all_task_names = list(set(train_task_names + eval_task_names))
    tasks = get_tasks(all_task_names, args.max_seq_len, args.load_tasks)

    max_v_sizes = {'word': args.max_word_v_size}
    token_indexer = {}
    if args.elmo:
        token_indexer["elmo"] = ELMoTokenCharactersIndexer("elmo")
        if not args.elmo_no_glove:
            token_indexer["words"] = SingleIdTokenIndexer()
    else:
        token_indexer["words"] = SingleIdTokenIndexer()

    vocab_path = os.path.join(args.exp_dir, 'vocab')
    preproc_file = os.path.join(args.exp_dir, args.preproc_file)
    if args.load_preproc and os.path.exists(preproc_file):
        # NOTE(security): unpickling can execute arbitrary code; this is only
        # safe as long as the file is a cache written by this function.
        with open(preproc_file, 'rb') as preproc_fh:
            preproc = pkl.load(preproc_fh)
        vocab = Vocabulary.from_files(vocab_path)
        word_embs = preproc['word_embs']
        for task in tasks:
            train, val, test = preproc[task.name]
            task.train_data = train
            task.val_data = val
            task.test_data = test
        log.info("\tFinished building vocab. Using %d words",
                 vocab.get_vocab_size('tokens'))
        log.info("\tLoaded data from %s", preproc_file)
    else:
        log.info("\tProcessing tasks from scratch")
        word2freq = get_words(tasks)
        vocab = get_vocab(word2freq, max_v_sizes)
        word_embs = get_embeddings(vocab, args.word_embs_file, args.d_word)
        preproc = {'word_embs': word_embs}
        for task in tasks:
            train, val, test = process_task(task, token_indexer, vocab)
            task.train_data = train
            task.val_data = val
            task.test_data = test
            del_field_tokens(task)
            preproc[task.name] = (train, val, test)
        log.info("\tFinished indexing tasks")
        # The context manager guarantees the cache is flushed and closed.
        with open(preproc_file, 'wb') as preproc_fh:
            pkl.dump(preproc, preproc_fh)
        vocab.save_to_files(vocab_path)
        log.info("\tSaved data to %s", preproc_file)
        # Only defined on this branch; drop the reference once it is no longer needed.
        del word2freq
    del preproc

    train_tasks = [task for task in tasks if task.name in train_task_names]
    eval_tasks = [task for task in tasks if task.name in eval_task_names]
    log.info('\t  Training on %s', ', '.join([task.name for task in train_tasks]))
    log.info('\t  Evaluating on %s', ', '.join([task.name for task in eval_tasks]))
    return train_tasks, eval_tasks, vocab, word_embs