def from_params(cls,  # type: ignore
                params: Params,
                serialization_dir: str,
                recover: bool = False) -> 'GanTestTrainer':
    dataset_reader = DatasetReader.from_params(params.pop("data_reader"))
    data = dataset_reader.read("")

    noise_reader = DatasetReader.from_params(params.pop("noise_reader"))
    noise = noise_reader.read("")

    generator = Model.from_params(params.pop("generator"))
    discriminator = Model.from_params(params.pop("discriminator"))
    iterator = DataIterator.from_params(params.pop("iterator"))
    noise_iterator = DataIterator.from_params(params.pop("noise_iterator"))

    generator_optimizer = Optimizer.from_params(
        [[n, p] for n, p in generator.named_parameters() if p.requires_grad],
        params.pop("generator_optimizer"))

    discriminator_optimizer = Optimizer.from_params(
        [[n, p] for n, p in discriminator.named_parameters() if p.requires_grad],
        params.pop("discriminator_optimizer"))

    num_epochs = params.pop_int("num_epochs")
    batches_per_epoch = params.pop_int("batches_per_epoch")

    params.pop("trainer")
    params.assert_empty(__name__)

    return cls(serialization_dir,
               data,
               noise,
               generator,
               discriminator,
               iterator,
               noise_iterator,
               generator_optimizer,
               discriminator_optimizer,
               batches_per_epoch,
               num_epochs)

def test_archiving(self):
    # copy params, since they'll get consumed during training
    params_copy = self.params.duplicate()
    params_dict_copy = copy.deepcopy(self.params.as_dict())

    # `train_model` should create an archive
    serialization_dir = self.TEST_DIR / "archive_test"
    model = train_model(self.params, serialization_dir=serialization_dir)

    archive_path = serialization_dir / "model.tar.gz"

    # load from the archive
    archive = load_archive(archive_path)
    model2 = archive.model

    assert_models_equal(model, model2)

    assert isinstance(
        archive.dataset_reader,
        type(DatasetReader.from_params(params_copy["dataset_reader"].duplicate())),
    )
    assert isinstance(
        archive.validation_dataset_reader,
        type(DatasetReader.from_params(params_copy["dataset_reader"].duplicate())),
    )  # validation_dataset_reader is not in the config, so fall back to dataset_reader

    # check that params are the same
    params2 = archive.config
    assert params2.as_dict() == params_dict_copy

def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Load all the datasets specified by the config.
    """
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    validation_dataset_reader_params = params.pop("validation_dataset_reader", None)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info("Using a separate dataset reader to load validation and test data.")
        validation_and_test_dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets

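# A minimal usage sketch (not from the original source) for the helper above.
# It assumes AllenNLP's registered "sequence_tagging" reader and uses
# hypothetical placeholder paths.
example_params = Params({
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "data/train.tsv",
    "validation_data_path": "data/dev.tsv",
})
example_datasets = datasets_from_params(example_params)
train_instances = example_datasets["train"]
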
def read_all_datasets(
    train_data_path: str,
    dataset_reader: DatasetReader,
    validation_dataset_reader: DatasetReader = None,
    validation_data_path: str = None,
    test_data_path: str = None,
) -> Dict[str, Dataset]:
    """
    Reads all datasets (perhaps lazily, if the corresponding dataset readers are lazy)
    and returns a dictionary mapping dataset name ("train", "validation" or "test") to
    the iterable resulting from `reader.read(filename)`.
    """
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)
    datasets: Dict[str, Dataset] = {"train": train_data}

    validation_dataset_reader = validation_dataset_reader or dataset_reader

    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets

def datasets_from_params(params: Params) -> Dict[str, Dataset]:
    """
    Load all the datasets specified by the config.

    # Parameters

    params : `Params`
    cache_directory : `str`, optional
        If given, we will instruct the `DatasetReaders` that we construct to cache their
        instances in this location (or read their instances from caches in this location,
        if a suitable cache already exists). This is essentially a `base` directory for
        the cache, as we will additionally add the `cache_prefix` to this directory,
        giving an actual cache location of `cache_directory + cache_prefix`.
    cache_prefix : `str`, optional
        This works in conjunction with the `cache_directory`. The idea is that the
        `cache_directory` contains caches for all different parameter settings, while
        the `cache_prefix` captures a specific set of parameters that led to a particular
        cache file. That is, if you change the tokenization settings inside your
        `DatasetReader`, you don't want to read cached data that used the old settings.
        In order to avoid this, we compute a hash of the parameters used to construct
        each `DatasetReader` and use that as a "prefix" to the cache files inside the
        base `cache_directory`. So, a given `input_file` would be cached essentially as
        `cache_directory + cache_prefix + input_file`, where you specify a
        `cache_directory`, the `cache_prefix` is based on the dataset reader parameters,
        and the `input_file` is whatever path you provided to `DatasetReader.read()`.
        In order to allow you to give recognizable names to these prefixes if you want
        them, you can manually specify the `cache_prefix`. Note that in some rare cases
        this can be dangerous, as we'll use the `same` prefix for both train and
        validation dataset readers.
    """
    dataset_reader_params = params.pop("dataset_reader")
    validation_dataset_reader_params = params.pop("validation_dataset_reader", None)

    dataset_reader = DatasetReader.from_params(dataset_reader_params)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info("Using a separate dataset reader to load validation and test data.")
        validation_and_test_dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params
        )

    train_data_path = params.pop("train_data_path")
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    validation_data_path = params.pop("validation_data_path", None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets

def datasets_from_params(
    params: Params,
    train: bool = True,
    validation: bool = True,
    test: bool = True,
    serialization_dir: Optional[Union[str, PathLike]] = None,
) -> Dict[str, Union["AllennlpDataset", "AllennlpLazyDataset"]]:
    """
    Load datasets specified by the config.
    """
    datasets: Dict[str, Union["AllennlpDataset", "AllennlpLazyDataset"]] = {}

    train = train and ("train_data_path" in params)
    validation = validation and ("validation_data_path" in params)
    test = test and ("test_data_path" in params)

    if not any((train, validation, test)):
        # Return early so we don't unnecessarily initialize the train data reader.
        return datasets

    dataset_reader_params = params.pop("dataset_reader")
    dataset_reader = DatasetReader.from_params(
        dataset_reader_params, serialization_dir=serialization_dir)

    if train:
        train_data_path = params.pop("train_data_path")
        logger.info("Reading training data from %s", train_data_path)
        train_data = dataset_reader.read(train_data_path)
        datasets["train"] = train_data

    if not validation and not test:
        # Return early so we don't unnecessarily initialize the validation/test data
        # reader.
        return datasets

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    validation_dataset_reader_params = params.pop("validation_dataset_reader", None)
    if validation_dataset_reader_params is not None:
        logger.info("Using a separate dataset reader to load validation and test data.")
        validation_and_test_dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params, serialization_dir=serialization_dir)

    if validation:
        validation_data_path = params.pop("validation_data_path")
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    if test:
        test_data_path = params.pop("test_data_path")
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets

def _load_dataset_readers(config):
    dataset_reader_params = config.get("dataset_reader")

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.get(
        "validation_dataset_reader", dataset_reader_params.duplicate())

    dataset_reader = DatasetReader.from_params(dataset_reader_params)
    validation_dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)

    return dataset_reader, validation_dataset_reader

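# Usage sketch for the helper above (not from the original source); the config
# path is a hypothetical placeholder for an AllenNLP training configuration.
example_config = Params.from_file("config.json")
train_reader, validation_reader = _load_dataset_readers(example_config)
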
def multitask_datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Load all the datasets specified by the config.
    """
    # In the multitask setting, the dataset types are indexed by the names. As
    # such, you can use a disjoint set of readers for the train, test, and
    # validation sets (just give them different names)
    readers = {name: DatasetReader.from_params(reader_params)
               for name, reader_params in params.pop("dataset_readers").items()}

    train_data_paths = params.pop('train_data_paths')
    validation_data_paths = params.pop('validation_data_paths', None)
    test_data_paths = params.pop("test_data_paths", None)

    datasets: Dict[str, Iterable[Instance]] = {
        "train": load_datasets(train_data_paths, readers)
    }
    if validation_data_paths is not None:
        datasets["validation"] = load_datasets(validation_data_paths, readers)
    if test_data_paths is not None:
        datasets["test"] = load_datasets(test_data_paths, readers)

    return datasets

def main(train_path, val_path, test_path, config_path, subword_model_path, out_dir):
    params = Params.from_file(config_path)
    reader_params = params.pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)

    processor = SentencePieceProcessor()
    processor.Load(subword_model_path)

    train_text_file = os.path.join(out_dir, "train.text.txt")
    train_summary_file = os.path.join(out_dir, "train.summary.txt")
    val_text_file = os.path.join(out_dir, "val.text.txt")
    val_summary_file = os.path.join(out_dir, "val.summary.txt")
    test_text_file = os.path.join(out_dir, "test.text.txt")
    test_summary_file = os.path.join(out_dir, "test.summary.txt")

    files = ((train_path, train_text_file, train_summary_file),
             (val_path, val_text_file, val_summary_file),
             (test_path, test_text_file, test_summary_file))
    for path, text_file_name, summary_file_name in files:
        with open(text_file_name, "w") as text_file, open(summary_file_name, "w") as summary_file:
            for text, summary in reader.parse_set(path):
                text_subwords = processor.EncodeAsPieces(text)
                summary_subwords = processor.EncodeAsPieces(summary)
                text_subwords.insert(0, "<t>")
                text_subwords.append("</t>")
                summary_subwords.insert(0, "<t>")
                summary_subwords.append("</t>")
                text_file.write(" ".join(text_subwords) + "\n")
                summary_file.write((" ".join(summary_subwords)) + "\n")

def preprocess(config_path, file_path, save_path, bert_path, max_src_tokens, max_tgt_tokens,
               lower=False, nrows=None):
    bert = BertData(bert_path, lower, max_src_tokens, max_tgt_tokens)

    params = Params.from_file(config_path)
    reader_params = params.pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)

    data = []
    for i, (text, summary) in enumerate(reader.parse_set(file_path)):
        if nrows is not None and i >= nrows:
            break
        src = [(s.text.lower() if lower else s.text).split() for s in sentenize(text)]
        tgt = [(s.text.lower() if lower else s.text).split() for s in sentenize(summary)]
        src_indices, tgt_indices, segments_ids, cls_ids, src_txt, tgt_txt = bert.preprocess(src, tgt)
        b_data_dict = {
            "src": src_indices,
            "tgt": tgt_indices,
            "segs": segments_ids,
            'clss': cls_ids,
            'src_txt': src_txt,
            "tgt_txt": tgt_txt
        }
        data.append(b_data_dict)
    torch.save(data, save_path)

def _predict_iter(self, data: Union[Iterable[Dict[str, Any]], List[Dict[str, Any]]]
                  ) -> Iterable[Dict[str, Any]]:
    '''
    Iterates over the predictions and yields one prediction at a time.

    This is a useful wrapper as it performs the data pre-processing and
    assertion checks. The predictions are made in batches so that the model
    does not load lots of data at once and run into memory issues.

    :param data: Iterable or list of dictionaries that the predictor can take
                 as input e.g. the `target-tagger` predictor expects at most a
                 `text` key and value.
    :yields: A dictionary containing all the values the model outputs e.g.
             for the `target_tagger` model it would return `logits`,
             `class_probabilities`, `mask`, and `tags`.
    :raises AssertionError: If the `model` attribute is None. This can be
                            overcome by either fitting or loading a model.
    :raises TypeError: If the data given is not of type List or Iterable.
    '''
    no_model_error = 'There is no model to make predictions, either fit '\
                     'or load a model to resolve this.'
    assert self.model, no_model_error
    self.model.eval()

    all_model_params = Params.from_file(self._param_fp)

    reader_params = all_model_params.get("dataset_reader")
    dataset_reader = DatasetReader.from_params(reader_params)
    predictor = Predictor.by_name(self._predictor_name)(self.model, dataset_reader)

    batch_size = 64
    if 'iterator' in all_model_params:
        iter_params = all_model_params.get("iterator")
        if 'batch_size' in iter_params:
            batch_size = iter_params['batch_size']

    # Data has to be an iterator
    if isinstance(data, list) or isinstance(data, collections.Iterable):
        data = iter(data)
    else:
        raise TypeError(f'Data given has to be of type {collections.Iterable}'
                        f' and not {type(data)}')

    data_exists = True
    while data_exists:
        data_batch = []
        for _ in range(batch_size):
            try:
                data_batch.append(next(data))
            except StopIteration:
                data_exists = False
        if data_batch:
            predictions = predictor.predict_batch_json(data_batch)
            for prediction in predictions:
                yield prediction

def main(args):
    params = Params.from_file(args.config_path)
    stdout_handler = prepare_global_logging(args.output_dir, False)
    prepare_environment(params)

    reader = DatasetReader.from_params(params["dataset_reader"])
    train_dataset = reader.read(params.pop("train_data_path", None))
    valid_dataset = reader.read(params.pop("validation_data_path", None))
    test_data_path = params.pop("test_data_path", None)
    if test_data_path:
        test_dataset = reader.read(test_data_path)
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset + test_dataset)
    else:
        test_dataset = None
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset)

    model_params = params.pop("model", None)
    model = Model.from_params(model_params.duplicate(), vocab=vocab)

    vocab.save_to_files(os.path.join(args.output_dir, "vocabulary"))

    # copy the config file
    with open(args.config_path, "r", encoding="utf-8") as f_in:
        with open(os.path.join(args.output_dir, "config.json"), "w", encoding="utf-8") as f_out:
            f_out.write(f_in.read())

    iterator = DataIterator.from_params(params.pop("iterator", None))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer", None)
    trainer = Trainer.from_params(model=model,
                                  serialization_dir=args.output_dir,
                                  iterator=iterator,
                                  train_data=train_dataset,
                                  validation_data=valid_dataset,
                                  params=trainer_params.duplicate())
    trainer.train()

    # evaluate on the test set
    if test_dataset:
        logging.info("Evaluating on the test set")
        import torch  # import here to ensure reproducibility of the experiment
        model.load_state_dict(torch.load(os.path.join(args.output_dir, "best.th")))
        test_metrics = evaluate(model, test_dataset, iterator,
                                cuda_device=trainer_params.pop("cuda_device", 0),
                                batch_weight_key=None)
        logging.info(f"Metrics on the test set: {test_metrics}")
        with open(os.path.join(args.output_dir, "test_metrics.txt"), "w", encoding="utf-8") as f_out:
            f_out.write(f"Metrics on the test set: {test_metrics}")

    cleanup_global_logging(stdout_handler)

def data_to_tensors(
    data: TransactionsData,
    reader: DatasetReader,
    vocab: Vocabulary,
    device: Union[torch.device, int] = -1,
) -> ModelsInput:
    instances = Batch([reader.text_to_instance(**data.to_dict())])

    instances.index_instances(vocab)
    inputs = instances.as_tensor_dict()
    return move_to_device(inputs, device)

def setUp(self): super().setUp() param_file = self.FIXTURES_ROOT / "simple_tagger" / "experiment_with_regularization.json" self.set_up_model(param_file, self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv") params = Params.from_file(param_file) self.reader = DatasetReader.from_params(params["dataset_reader"]) self.iterator = DataIterator.from_params(params["iterator"]) self.trainer = Trainer.from_params(self.model, self.TEST_DIR, self.iterator, self.dataset, None, params.get("trainer"))
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    # copied from allennlp.training.util.datasets_from_params
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    validation_dataset_reader_params = params.pop("validation_dataset_reader", None)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info("Using a separate dataset reader to load validation and test data.")
        validation_and_test_dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    train_low_data_path = params["trainer"].pop('low_data_path')
    logger.info("Reading training (low) data from %s", train_low_data_path)
    train_low_data = dataset_reader.read(train_low_data_path)
    datasets["train_low"] = train_low_data

    return datasets

def from_partial_objects(
    cls,
    serialization_dir: str,
    data_reader: DatasetReader,
    noise_reader: DatasetReader,
    generator: Model,
    discriminator: Model,
    iterator: DataIterator,
    noise_iterator: DataIterator,
    generator_optimizer: Lazy[Optimizer],
    discriminator_optimizer: Lazy[Optimizer],
    num_epochs: int,
    batches_per_epoch: int,
) -> "GanTestTrainer":
    data = data_reader.read("")
    noise = noise_reader.read("")

    generator_params = [[n, p] for n, p in generator.named_parameters() if p.requires_grad]
    generator_optimizer_ = generator_optimizer.construct(model_parameters=generator_params)

    discriminator_params = [[n, p] for n, p in discriminator.named_parameters() if p.requires_grad]
    discriminator_optimizer_ = discriminator_optimizer.construct(
        model_parameters=discriminator_params)

    return cls(
        serialization_dir,
        data,
        noise,
        generator,
        discriminator,
        iterator,
        noise_iterator,
        generator_optimizer_,
        discriminator_optimizer_,
        batches_per_epoch,
        num_epochs,
    )

def train_model(train_fp: Path, dev_fp: Path, model_fp: Path,
                vocab_data_fps: Optional[List[Path]] = None) -> Tuple[Model, Params]:
    '''
    :param train_fp: The training dataset file path
    :param dev_fp: The development dataset file path
    :param model_fp: The json file that describes the model
    :param vocab_data_fps: An optional list of additional dataset files that
                           will be used to create the model's vocab
    :returns: A tuple containing the trained model and an object that
              describes the model.
    '''
    set_random_env()
    model_params = Params.from_file(model_fp)
    emotion_dataset_reader = DatasetReader.from_params(model_params.pop('dataset_reader'))

    # Data
    train_dataset = emotion_dataset_reader.read(cached_path(str(train_fp)))
    dev_dataset = emotion_dataset_reader.read(cached_path(str(dev_fp)))

    vocab_datasets = [train_dataset, dev_dataset]
    if vocab_data_fps:
        for vocab_data_fp in vocab_data_fps:
            vocab_datasets.append(emotion_dataset_reader.read(cached_path(str(vocab_data_fp))))
    vocab_data = []
    for vocab_dataset in vocab_datasets:
        vocab_data.extend(vocab_dataset)
    vocab = Vocabulary.from_instances(vocab_data)

    emotion_model = Model.from_params(vocab=vocab, params=model_params.pop('model'))
    data_iter = DataIterator.from_params(model_params.pop('iterator'))
    data_iter.index_with(vocab)

    # Trainer
    with tempfile.TemporaryDirectory() as serial_dir:
        trainer_params = model_params.pop('trainer')
        trainer = Trainer.from_params(model=emotion_model,
                                      serialization_dir=serial_dir,
                                      iterator=data_iter,
                                      train_data=train_dataset,
                                      validation_data=dev_dataset,
                                      params=trainer_params)
        _ = trainer.train()

        temp_config_fp = str(Path(serial_dir, CONFIG_NAME).resolve())
        Params.from_file(model_fp).to_file(temp_config_fp)
        vocab.save_to_files(Path(serial_dir, "vocabulary").resolve())

        archive_model(serial_dir, files_to_archive=model_params.files_to_archive)
        model_archive = load_archive(serial_dir, cuda_device=0)
        return model_archive.model, model_archive.config

def test_multi_iterator(self):
    params, file_paths = get_dataset_params_paths(['ner', 'ccg'])

    multitask_reader = DatasetReader.from_params(params)
    dataset = multitask_reader.read(file_paths)

    iterator_params = Params({
        "type": "multitask_iterator",
        "iterators": {
            "ner": {
                "type": "bucket",
                "sorting_keys": [["tokens", "num_tokens"]],
                "padding_noise": 0.0,
                "batch_size": 2
            },
            "ccg": {
                "type": "basic",
                "batch_size": 1
            }
        },
        "names_to_index": ["ner", "ccg"],
    })

    multi_iterator = DataIterator.from_params(iterator_params)

    # make the vocab
    vocab = Vocabulary.from_params(Params({}), (instance for instance in dataset))
    multi_iterator.index_with(vocab)

    all_batches = []
    for epoch in range(2):
        all_batches.append([])
        for batch in multi_iterator(dataset, shuffle=True, num_epochs=1):
            all_batches[-1].append(batch)

    # 3 batches per epoch
    self.assertEqual([len(b) for b in all_batches], [3, 3])

    ner_batches = []
    ccg_batches = []
    for epoch_batches in all_batches:
        ner_batches.append(0)
        ccg_batches.append(0)
        for batch in epoch_batches:
            if 'original_pos_tags' not in batch:
                ner_batches[-1] += 1
            if 'original_pos_tags' in batch:
                ccg_batches[-1] += 1

    # 1 NER batch per epoch, 2 CCG batches per epoch
    self.assertEqual(ner_batches, [1, 1])
    self.assertEqual(ccg_batches, [2, 2])

def __init__(self, archive_path: str, device: int = -1, batch_size: int = 32):
    archive_path = Path(archive_path)
    archive = load_archive(archive_path)
    self.params = archive.config
    self.model = archive.model.eval()
    self.batch_size = batch_size
    self.reader = DatasetReader.from_params(self.params.get("dataset_reader"))
    self.vocab = self._load_vocab(archive_path)
    self.idx2label = self.vocab.get_index_to_token_vocabulary('labels')
    if device != -1:
        self.model.to(f"cuda:{device}")
    super(AllenNLPLimePredictor, self).__init__(self.idx2label)

def run_config(config):
    params = Params(json.loads(config))
    params_copy = params.duplicate()

    if "dataset_reader" in params:
        reader = DatasetReader.from_params(params.pop("dataset_reader"))
    else:
        raise RuntimeError("`dataset_reader` section is required")

    loader_params = params.pop("iterator")
    train_data_loader = DataIterator.from_params(
        reader=reader,
        data_path=params.pop("train_data_path"),
        params=loader_params.duplicate(),
    )
    dev_data_loader = DataIterator.from_params(
        reader=reader,
        data_path=params.pop("validation_data_path"),
        params=loader_params,
    )

    print("Building the vocabulary...")
    vocab = Vocabulary.from_instances(train_data_loader.iter_instances())

    if "model" not in params:
        # 'dataset' mode: just preview the first 10 instances
        print("Showing the first 10 instances:")
        for i, inst in enumerate(train_data_loader.iter_instances()):
            if i >= 10:
                break
            print(inst)
        return None

    model = Model.from_params(vocab=vocab, params=params.pop("model"))

    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    # set up a temporary, empty directory for serialization
    with tempfile.TemporaryDirectory() as serialization_dir:
        trainer = Trainer.from_params(
            model=model,
            serialization_dir=serialization_dir,
            data_loader=train_data_loader,
            validation_data_loader=dev_data_loader,
            params=params.pop("trainer"),
        )
        trainer.train()

    return {
        "params": params_copy,
        "dataset_reader": reader,
        "vocab": vocab,
        "model": model,
    }

def from_dataset_reader(
    cls,
    reader: DatasetReader,
    data_path: str,
    batch_size: int,
    shuffle: bool = False,
    batches_per_epoch: Optional[int] = None,
) -> "SimpleDataLoader":
    instances = list(reader.read(data_path))
    return cls(instances, batch_size, shuffle=shuffle, batches_per_epoch=batches_per_epoch)

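# A brief usage sketch for the classmethod above (not from the original source),
# assuming AllenNLP's built-in SequenceTaggingDatasetReader; the data path is a
# hypothetical placeholder.
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader

example_loader = SimpleDataLoader.from_dataset_reader(
    SequenceTaggingDatasetReader(),
    data_path="data/train.tsv",
    batch_size=32,
    shuffle=True,
)
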
def main(train_path, val_path, test_path, config_path, subword_model_path, out_dir,
         max_text_subwords, max_summary_subwords, source_suffix, target_suffix,
         insert_tags=False, lowercase=False):
    params = Params.from_file(config_path)
    reader_params = params.pop("dataset_reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)

    processor = SentencePieceProcessor()
    processor.Load(subword_model_path)

    train_text_file = os.path.join(out_dir, "train.{}".format(source_suffix))
    train_summary_file = os.path.join(out_dir, "train.{}".format(target_suffix))
    val_text_file = os.path.join(out_dir, "val.{}".format(source_suffix))
    val_summary_file = os.path.join(out_dir, "val.{}".format(target_suffix))
    test_text_file = os.path.join(out_dir, "test.{}".format(source_suffix))
    test_summary_file = os.path.join(out_dir, "test.{}".format(target_suffix))

    files = ((train_path, train_text_file, train_summary_file),
             (val_path, val_text_file, val_summary_file),
             (test_path, test_text_file, test_summary_file))
    for path, text_file_name, summary_file_name in files:
        with open(text_file_name, "w") as text_file, open(summary_file_name, "w") as summary_file:
            for text, summary in reader.parse_set(path):
                if lowercase:
                    text = text.lower()
                    summary = summary.lower()
                text_subwords = processor.EncodeAsPieces(text)
                if max_text_subwords:
                    text_subwords = text_subwords[:max_text_subwords]
                summary_subwords = processor.EncodeAsPieces(summary)
                if max_summary_subwords:
                    summary_subwords = summary_subwords[:max_summary_subwords]
                if insert_tags:
                    text_subwords.insert(0, "<t>")
                    text_subwords.append("</t>")
                    summary_subwords.insert(0, "<t>")
                    summary_subwords.append("</t>")
                text_file.write(" ".join(text_subwords) + "\n")
                summary_file.write((" ".join(summary_subwords)) + "\n")

def load_predictor(path):
    if path.endswith(".tar.gz"):
        return Predictor.from_path(path, "ja_seq2seq")
    elif path.endswith(".th"):
        serialization_dir = str(Path(path).parent)
        params = Params.from_file(str(serialization_dir + "/config.json"))
        model = Model.load(params, str(serialization_dir), weights_file=path)
        dataset_reader = DatasetReader.from_params(params.get("dataset_reader"))
        return JaSeq2SeqPredictor(model, dataset_reader)
    else:
        raise ValueError

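# Usage sketch for the loader above; the archive path is a hypothetical
# placeholder, and the "source" input key is an assumption about what the
# ja_seq2seq predictor expects.
example_predictor = load_predictor("model/model.tar.gz")
print(example_predictor.predict_json({"source": "..."}))
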
def setup_datasets(params: Params) -> Dict[str, Iterable[Instance]]:
    dataset_reader_params = params.get('dataset_reader')
    validation_dataset_reader_params = params.get('validation_dataset_reader', None)
    dataset_reader = DatasetReader.from_params(dataset_reader_params)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        validation_and_test_dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)

    train_data_path = params.get('train_data_path')
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    validation_data_path = params.get('validation_data_path', None)
    if validation_data_path is not None:
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.get("test_data_path", None)
    if test_data_path is not None:
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets

def setUp(self): super().setUp() param_file = self.FIXTURES_ROOT / "simple_tagger" / "experiment_with_regularization.json" self.set_up_model(param_file, self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv") params = Params.from_file(param_file) self.reader = DatasetReader.from_params(params["dataset_reader"]) self.data_loader = DataLoader.from_params(dataset=self.instances, params=params["data_loader"]) self.trainer = Trainer.from_params( model=self.model, data_loader=self.data_loader, serialization_dir=self.TEST_DIR, params=params.get("trainer"), )
def from_config(cls, config: Params, predictor_name: str = None) -> 'Predictor':
    dataset_reader_params = config["dataset_reader"]
    dataset_reader = DatasetReader.from_params(dataset_reader_params)

    tokenizer = dataset_reader._tokenizer or WordTokenizer()  # pylint: disable=protected-access
    token_indexers = dataset_reader._token_indexers  # pylint: disable=protected-access

    model_name = config.get("model").get("type")
    model = Model.load(config)
    model.eval()

    predictor_name = predictor_name or DEFAULT_PREDICTORS[model_name]
    return Predictor.by_name(predictor_name)(model, tokenizer, token_indexers)

def main(args): config = Params.from_file(args.config_file) reader_config = config.pop("dataset_reader") reader = DatasetReader.from_params(reader_config) x = reader.read(args.input_file) if args.output_file is None: output_file = sys.stdout else: output_file = open(args.output_file, mode='w') for instance in x: tokens = instance.fields['tokens'].tokens tags = instance.fields['tags'].labels text = [t.text for t in tokens] for token, tag in zip(text, tags): print(f'{token}\t{tag}', file=output_file) print('', file=output_file)
def main(args): config = Params.from_file(args.config_file) reader_config = config.pop("dataset_reader") reader = DatasetReader.from_params(reader_config) x = reader.read(args.input_file) if args.output_file is None: outfile = sys.stdout else: outfile = open(args.output_file, mode='w') # print(len(x.instances)) for instance in x: tokens = instance.fields['tokens'].tokens text = ' '.join([t.text for t in tokens]) line = str(json.dumps({"sentence": text})) # print(tokens) print(line, file=outfile)
def train_subwords(train_path, model_path, model_type, vocab_size, config_path):
    temp = tempfile.NamedTemporaryFile(mode="w", delete=False)

    params = Params.from_file(config_path)
    reader_params = params.pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    for text, summary in reader.parse_set(train_path):
        temp.write(text + "\n")
        temp.write(summary + "\n")
    temp.close()

    if not os.path.exists(model_path):
        os.makedirs(model_path)
    cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
        temp.name,
        os.path.join(model_path, model_type),
        vocab_size,
        model_type)
    sp_trainer.Train(cmd)
    os.unlink(temp.name)