    def from_params(
            cls,  # type: ignore
            params: Params,
            serialization_dir: str,
            recover: bool = False) -> 'GanTestTrainer':
        dataset_reader = DatasetReader.from_params(params.pop("data_reader"))
        data = dataset_reader.read("")

        noise_reader = DatasetReader.from_params(params.pop("noise_reader"))
        noise = noise_reader.read("")

        generator = Model.from_params(params.pop("generator"))
        discriminator = Model.from_params(params.pop("discriminator"))
        iterator = DataIterator.from_params(params.pop("iterator"))
        noise_iterator = DataIterator.from_params(params.pop("noise_iterator"))

        generator_optimizer = Optimizer.from_params(
            [[n, p]
             for n, p in generator.named_parameters() if p.requires_grad],
            params.pop("generator_optimizer"))

        discriminator_optimizer = Optimizer.from_params(
            [[n, p]
             for n, p in discriminator.named_parameters() if p.requires_grad],
            params.pop("discriminator_optimizer"))

        num_epochs = params.pop_int("num_epochs")
        batches_per_epoch = params.pop_int("batches_per_epoch")
        params.pop("trainer")

        params.assert_empty(__name__)

        return cls(serialization_dir, data, noise, generator, discriminator,
                   iterator, noise_iterator, generator_optimizer,
                   discriminator_optimizer, batches_per_epoch, num_epochs)
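
A minimal sketch of the kind of configuration this `from_params` would consume, assuming AllenNLP's `Params`; every registered "type" name below is a placeholder invented for illustration rather than taken from the snippet above.

# Hypothetical Params layout for GanTestTrainer.from_params; the "type"
# values are placeholders for whatever components the test registers.
from allennlp.common import Params

gan_params = Params({
    "data_reader": {"type": "some-data-reader"},
    "noise_reader": {"type": "some-noise-reader"},
    "generator": {"type": "some-generator"},
    "discriminator": {"type": "some-discriminator"},
    "iterator": {"type": "basic", "batch_size": 32},
    "noise_iterator": {"type": "basic", "batch_size": 32},
    "generator_optimizer": {"type": "sgd", "lr": 0.1},
    "discriminator_optimizer": {"type": "sgd", "lr": 0.1},
    "num_epochs": 5,
    "batches_per_epoch": 10,
    "trainer": {"type": "gan-test"},  # popped and discarded by the snippet above
})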
Example #2
    def test_archiving(self):
        # copy params, since they'll get consumed during training
        params_copy = self.params.duplicate()
        params_dict_copy = copy.deepcopy(self.params.as_dict())

        # `train_model` should create an archive
        serialization_dir = self.TEST_DIR / "archive_test"
        model = train_model(self.params, serialization_dir=serialization_dir)

        archive_path = serialization_dir / "model.tar.gz"

        # load from the archive
        archive = load_archive(archive_path)
        model2 = archive.model

        assert_models_equal(model, model2)

        assert isinstance(
            archive.dataset_reader,
            type(DatasetReader.from_params(params_copy["dataset_reader"].duplicate())),
        )
        assert isinstance(
            archive.validation_dataset_reader,
            type(DatasetReader.from_params(params_copy["dataset_reader"].duplicate())),
        )  # validation_dataset_reader is not in the config, so it falls back to dataset_reader

        # check that params are the same
        params2 = archive.config
        assert params2.as_dict() == params_dict_copy
Example #3
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Load all the datasets specified by the config.
    """
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    validation_dataset_reader_params = params.pop("validation_dataset_reader", None)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info("Using a separate dataset reader to load validation and test data.")
        validation_and_test_dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets
def read_all_datasets(
    train_data_path: str,
    dataset_reader: DatasetReader,
    validation_dataset_reader: DatasetReader = None,
    validation_data_path: str = None,
    test_data_path: str = None,
) -> Dict[str, Dataset]:
    """
    Reads all datasets (perhaps lazily, if the corresponding dataset readers are lazy) and returns a
    dictionary mapping dataset name ("train", "validation" or "test") to the iterable resulting from
    `reader.read(filename)`.
    """

    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, Dataset] = {"train": train_data}

    validation_dataset_reader = validation_dataset_reader or dataset_reader

    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets
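
A short usage sketch for `read_all_datasets`; the reader type and file paths are illustrative assumptions, and no separate validation reader is passed, so validation data is read with `dataset_reader` via the fallback in the function above.

# Illustrative only: "sequence_tagging" is assumed to be an available
# registered reader and the paths are placeholders.
reader = DatasetReader.from_params(Params({"type": "sequence_tagging"}))
datasets = read_all_datasets(
    train_data_path="data/train.tsv",
    dataset_reader=reader,
    validation_data_path="data/dev.tsv",  # read with the same reader, since none is given
)
train_instances = datasets["train"]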
def datasets_from_params(params: Params) -> Dict[str, Dataset]:
    """
    Load all the datasets specified by the config.

    # Parameters

    params : `Params`
    cache_directory : `str`, optional
        If given, we will instruct the `DatasetReaders` that we construct to cache their
        instances in this location (or read their instances from caches in this location, if a
        suitable cache already exists).  This is essentially a `base` directory for the cache, as
        we will additionally add the `cache_prefix` to this directory, giving an actual cache
        location of `cache_directory + cache_prefix`.
    cache_prefix : `str`, optional
        This works in conjunction with the `cache_directory`.  The idea is that the
        `cache_directory` contains caches for all different parameter settings, while the
        `cache_prefix` captures a specific set of parameters that led to a particular cache file.
        That is, if you change the tokenization settings inside your `DatasetReader`, you don't
        want to read cached data that used the old settings.  In order to avoid this, we compute a
        hash of the parameters used to construct each `DatasetReader` and use that as a "prefix"
        to the cache files inside the base `cache_directory`.  So, a given `input_file` would
        be cached essentially as `cache_directory + cache_prefix + input_file`, where you specify
        a `cache_directory`, the `cache_prefix` is based on the dataset reader parameters, and
        the `input_file` is whatever path you provided to `DatasetReader.read()`.  In order to
        allow you to give recognizable names to these prefixes if you want them, you can manually
        specify the `cache_prefix`.  Note that in some rare cases this can be dangerous, as we'll
        use the `same` prefix for both train and validation dataset readers.
    """
    dataset_reader_params = params.pop("dataset_reader")
    validation_dataset_reader_params = params.pop("validation_dataset_reader", None)

    dataset_reader = DatasetReader.from_params(dataset_reader_params)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info("Using a separate dataset reader to load validation and test data.")
        validation_and_test_dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params
        )

    train_data_path = params.pop("train_data_path")
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    validation_data_path = params.pop("validation_data_path", None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets
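
The `cache_prefix` behaviour described in the docstring above, hashing the dataset reader's parameters so that changed settings never reuse a stale cache, can be pictured with a small sketch. This is only an illustration of the idea; the helper names are made up and this is not the function's actual implementation.

import hashlib
import json
import os

def _illustrative_cache_prefix(reader_params: dict) -> str:
    # Hash a canonical serialization of the reader's parameters, so any change
    # to those parameters yields a different prefix (and a fresh cache).
    canonical = json.dumps(reader_params, sort_keys=True)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]

def _illustrative_cache_path(cache_directory: str, reader_params: dict, input_file: str) -> str:
    # cache_directory + cache_prefix + input_file, as the docstring puts it.
    return os.path.join(cache_directory,
                        _illustrative_cache_prefix(reader_params),
                        os.path.basename(input_file))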
Example #6
def datasets_from_params(
    params: Params,
    train: bool = True,
    validation: bool = True,
    test: bool = True,
    serialization_dir: Optional[Union[str, PathLike]] = None,
) -> Dict[str, Union["AllennlpDataset", "AllennlpLazyDataset"]]:
    """
    Load datasets specified by the config.
    """
    datasets: Dict[str, Union["AllennlpDataset", "AllennlpLazyDataset"]] = {}

    train = train and ("train_data_path" in params)
    validation = validation and ("validation_data_path" in params)
    test = test and ("test_data_path" in params)
    if not any((train, validation, test)):
        # Return early so we don't unnecessarily initialize the train data reader.
        return datasets

    dataset_reader_params = params.pop("dataset_reader")
    dataset_reader = DatasetReader.from_params(
        dataset_reader_params, serialization_dir=serialization_dir)

    if train:
        train_data_path = params.pop("train_data_path")
        logger.info("Reading training data from %s", train_data_path)
        train_data = dataset_reader.read(train_data_path)
        datasets["train"] = train_data

    if not validation and not test:
        # Return early so we don't unnecessarily initialize the validation/test data
        # reader.
        return datasets

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    validation_dataset_reader_params = params.pop("validation_dataset_reader",
                                                  None)
    if validation_dataset_reader_params is not None:
        logger.info(
            "Using a separate dataset reader to load validation and test data."
        )
        validation_and_test_dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params,
            serialization_dir=serialization_dir)

    if validation:
        validation_data_path = params.pop("validation_data_path")
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(
            validation_data_path)
        datasets["validation"] = validation_data

    if test:
        test_data_path = params.pop("test_data_path")
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets
Example #7
def _load_dataset_readers(config):
    dataset_reader_params = config.get("dataset_reader")

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.get(
        "validation_dataset_reader", dataset_reader_params.duplicate())

    dataset_reader = DatasetReader.from_params(dataset_reader_params)
    validation_dataset_reader = DatasetReader.from_params(
        validation_dataset_reader_params)

    return dataset_reader, validation_dataset_reader
Example #8
def multitask_datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Load all the datasets specified by the config.
    """
    # In the multitask setting, the dataset types are indexed by the names. As
    # such, you can use a disjoint set of readers for the train, test, and
    # validation sets (just give them different names)
    readers = {name: DatasetReader.from_params(reader_params)
               for name, reader_params in params.pop("dataset_readers").items()}

    train_data_paths = params.pop('train_data_paths')
    validation_data_paths = params.pop('validation_data_paths', None)
    test_data_paths = params.pop("test_data_paths", None)

    datasets: Dict[str, Iterable[Instance]] = {
        "train": load_datasets(train_data_paths, readers)
    }

    if validation_data_paths is not None:
        datasets["validation"] = load_datasets(validation_data_paths, readers)

    if test_data_paths is not None:
        datasets["test"] = load_datasets(test_data_paths, readers)

    return datasets
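
The snippet above calls a `load_datasets` helper that is not shown. Given that the readers and data paths are keyed by the same task names, one plausible shape for it, offered purely as an assumption, reads each path with its correspondingly named reader and chains the resulting instances:

from itertools import chain
from typing import Dict, Iterable

# Hypothetical reconstruction of the missing helper; the real implementation
# may differ (for example, it might keep the per-task instances separate).
def load_datasets(data_paths: Dict[str, str],
                  readers: Dict[str, "DatasetReader"]) -> Iterable["Instance"]:
    return chain.from_iterable(
        readers[name].read(path) for name, path in data_paths.items()
    )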
def main(train_path, val_path, test_path, config_path, subword_model_path,
         out_dir):
    params = Params.from_file(config_path)
    reader_params = params.pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    processor = SentencePieceProcessor()
    processor.Load(subword_model_path)
    train_text_file = os.path.join(out_dir, "train.text.txt")
    train_summary_file = os.path.join(out_dir, "train.summary.txt")
    val_text_file = os.path.join(out_dir, "val.text.txt")
    val_summary_file = os.path.join(out_dir, "val.summary.txt")
    test_text_file = os.path.join(out_dir, "test.text.txt")
    test_summary_file = os.path.join(out_dir, "test.summary.txt")
    files = ((train_path, train_text_file,
              train_summary_file), (val_path, val_text_file, val_summary_file),
             (test_path, test_text_file, test_summary_file))
    for path, text_file_name, summary_file_name in files:
        with open(text_file_name,
                  "w") as text_file, open(summary_file_name,
                                          "w") as summary_file:
            for text, summary in reader.parse_set(path):
                text_subwords = processor.EncodeAsPieces(text)
                summary_subwords = processor.EncodeAsPieces(summary)
                text_subwords.insert(0, "<t>")
                text_subwords.append("</t>")
                summary_subwords.insert(0, "<t>")
                summary_subwords.append("</t>")
                text_file.write(" ".join(text_subwords) + "\n")
                summary_file.write((" ".join(summary_subwords)) + "\n")
Example #10
def preprocess(config_path,
               file_path,
               save_path,
               bert_path,
               max_src_tokens,
               max_tgt_tokens,
               lower=False,
               nrows=None):
    bert = BertData(bert_path, lower, max_src_tokens, max_tgt_tokens)
    params = Params.from_file(config_path)
    reader_params = params.pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    data = []
    for i, (text, summary) in enumerate(reader.parse_set(file_path)):
        if nrows is not None and i >= nrows:
            break
        src = [(s.text.lower() if lower else s.text).split()
               for s in sentenize(text)]
        tgt = [(s.text.lower() if lower else s.text).split()
               for s in sentenize(summary)]
        src_indices, tgt_indices, segments_ids, cls_ids, src_txt, tgt_txt = bert.preprocess(
            src, tgt)
        b_data_dict = {
            "src": src_indices,
            "tgt": tgt_indices,
            "segs": segments_ids,
            'clss': cls_ids,
            'src_txt': src_txt,
            "tgt_txt": tgt_txt
        }
        data.append(b_data_dict)
    torch.save(data, save_path)
    def _predict_iter(
        self, data: Union[Iterable[Dict[str, Any]], List[Dict[str, Any]]]
    ) -> Iterable[Dict[str, Any]]:
        '''
        Iterates over the predictions and yields one prediction at a time.
        This is a useful wrapper as it performs the data pre-processing and 
        assertion checks.

        The predictions are made in batches so that the model does not
        load lots of data at once and run into memory issues.

        :param data: Iterable or list of dictionaries that the predictor can 
                     take as input e.g. `target-tagger` predictor expects at 
                     most a `text` key and value.
        :yields: A dictionary containing all the values the model outputs e.g.
                 For the `target_tagger` model it would return `logits`, 
                 `class_probabilities`, `mask`, and `tags`.
        :raises AssertionError: If the `model` attribute is None. This can be 
                                overcome by either fitting or loading a model.
        :raises TypeError: If the data given is not of Type List or Iterable.
        '''
        no_model_error = 'There is no model to make predictions, either fit '\
                         'or load a model to resolve this.'
        assert self.model, no_model_error
        self.model.eval()

        all_model_params = Params.from_file(self._param_fp)

        reader_params = all_model_params.get("dataset_reader")
        dataset_reader = DatasetReader.from_params(reader_params)
        predictor = Predictor.by_name(self._predictor_name)(self.model,
                                                            dataset_reader)

        batch_size = 64
        if 'iterator' in all_model_params:
            iter_params = all_model_params.get("iterator")
            if 'batch_size' in iter_params:
                batch_size = iter_params['batch_size']

        # Data has to be an iterator
        if isinstance(data, list) or isinstance(data, collections.abc.Iterable):
            data = iter(data)
        else:
            raise TypeError(
                f'Data given has to be of type {collections.abc.Iterable}'
                f' and not {type(data)}')
        data_exists = True
        while data_exists:
            data_batch = []
            for _ in range(batch_size):
                try:
                    data_batch.append(next(data))
                except StopIteration:
                    data_exists = False
            if data_batch:
                predictions = predictor.predict_batch_json(data_batch)
                for prediction in predictions:
                    yield prediction
Example #12
def main(args):
    params = Params.from_file(args.config_path)
    stdout_handler = prepare_global_logging(args.output_dir, False)
    prepare_environment(params)

    reader = DatasetReader.from_params(params["dataset_reader"])
    train_dataset = reader.read(params.pop("train_data_path", None))
    valid_dataset = reader.read(params.pop("validation_data_path", None))
    test_data_path = params.pop("test_data_path", None)
    if test_data_path:
        test_dataset = reader.read(test_data_path)
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset +
                                          test_dataset)
    else:
        test_dataset = None
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset)

    model_params = params.pop("model", None)
    model = Model.from_params(model_params.duplicate(), vocab=vocab)
    vocab.save_to_files(os.path.join(args.output_dir, "vocabulary"))
    # copy config file
    with open(args.config_path, "r", encoding="utf-8") as f_in:
        with open(os.path.join(args.output_dir, "config.json"),
                  "w",
                  encoding="utf-8") as f_out:
            f_out.write(f_in.read())

    iterator = DataIterator.from_params(params.pop("iterator", None))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer", None)
    trainer = Trainer.from_params(model=model,
                                  serialization_dir=args.output_dir,
                                  iterator=iterator,
                                  train_data=train_dataset,
                                  validation_data=valid_dataset,
                                  params=trainer_params.duplicate())
    trainer.train()

    # evaluate on the test set
    if test_dataset:
        logging.info("Evaluating on the test set")
        import torch  # import here to ensure the republication of the experiment
        model.load_state_dict(
            torch.load(os.path.join(args.output_dir, "best.th")))
        test_metrics = evaluate(model,
                                test_dataset,
                                iterator,
                                cuda_device=trainer_params.pop(
                                    "cuda_device", 0),
                                batch_weight_key=None)
        logging.info(f"Metrics on the test set: {test_metrics}")
        with open(os.path.join(args.output_dir, "test_metrics.txt"),
                  "w",
                  encoding="utf-8") as f_out:
            f_out.write(f"Metrics on the test set: {test_metrics}")

    cleanup_global_logging(stdout_handler)
Example #13
def data_to_tensors(
    data: TransactionsData, reader: DatasetReader, vocab: Vocabulary, device: Union[torch.device, int] = -1,
) -> ModelsInput:

    instances = Batch([reader.text_to_instance(**data.to_dict())])

    instances.index_instances(vocab)
    inputs = instances.as_tensor_dict()
    return move_to_device(inputs, device)
Example #14
 def setUp(self):
     super().setUp()
     param_file = self.FIXTURES_ROOT / "simple_tagger" / "experiment_with_regularization.json"
     self.set_up_model(param_file,
                       self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
     params = Params.from_file(param_file)
     self.reader = DatasetReader.from_params(params["dataset_reader"])
     self.iterator = DataIterator.from_params(params["iterator"])
     self.trainer = Trainer.from_params(self.model, self.TEST_DIR,
                                        self.iterator, self.dataset, None,
                                        params.get("trainer"))
Example #15
 def setUp(self):
     super().setUp()
     param_file = self.FIXTURES_ROOT / 'simple_tagger' / 'experiment_with_regularization.json'
     self.set_up_model(param_file,
                       self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
     params = Params.from_file(param_file)
     self.reader = DatasetReader.from_params(params['dataset_reader'])
     self.iterator = DataIterator.from_params(params['iterator'])
     self.trainer = Trainer.from_params(self.model, self.TEST_DIR,
                                        self.iterator, self.dataset, None,
                                        params.get('trainer'))
Example #16
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    # copied from allennlp.training.util.datasets_from_params

    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    validation_dataset_reader_params = params.pop("validation_dataset_reader",
                                                  None)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info(
            "Using a separate dataset reader to load validation and test data."
        )
        validation_and_test_dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(
            validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    train_low_data_path = params["trainer"].pop('low_data_path')
    logger.info("Reading training (low) data from %s", train_low_data_path)
    train_low_data = dataset_reader.read(train_low_data_path)

    datasets["train_low"] = train_low_data

    return datasets
Example #17
    def from_partial_objects(
        cls,
        serialization_dir: str,
        data_reader: DatasetReader,
        noise_reader: DatasetReader,
        generator: Model,
        discriminator: Model,
        iterator: DataIterator,
        noise_iterator: DataIterator,
        generator_optimizer: Lazy[Optimizer],
        discriminator_optimizer: Lazy[Optimizer],
        num_epochs: int,
        batches_per_epoch: int,
    ) -> "GanTestTrainer":
        data = data_reader.read("")
        noise = noise_reader.read("")

        generator_params = [[n, p] for n, p in generator.named_parameters()
                            if p.requires_grad]
        generator_optimizer_ = generator_optimizer.construct(
            model_parameters=generator_params)

        discriminator_params = [[n, p]
                                for n, p in discriminator.named_parameters()
                                if p.requires_grad]
        discriminator_optimizer_ = discriminator_optimizer.construct(
            model_parameters=discriminator_params)

        return cls(
            serialization_dir,
            data,
            noise,
            generator,
            discriminator,
            iterator,
            noise_iterator,
            generator_optimizer_,
            discriminator_optimizer_,
            batches_per_epoch,
            num_epochs,
        )
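
The `Lazy[Optimizer]` arguments above exist because the optimizers cannot be built until the generator's and discriminator's parameters are available; `construct(model_parameters=...)` supplies the missing piece at that point. A rough stand-in for the pattern (not AllenNLP's actual `Lazy` implementation) looks like this:

from typing import Callable, Generic, TypeVar

T = TypeVar("T")

class LazyLike(Generic[T]):
    """Stand-in illustrating the Lazy pattern: hold a constructor until its
    late-bound keyword arguments (e.g. model_parameters) become available."""

    def __init__(self, constructor: Callable[..., T]) -> None:
        self._constructor = constructor

    def construct(self, **kwargs) -> T:
        # Called once the missing arguments exist; returns the real object.
        return self._constructor(**kwargs)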
def train_model(
        train_fp: Path,
        dev_fp: Path,
        model_fp: Path,
        vocab_data_fps: Optional[List[Path]] = None) -> Tuple[Model, Params]:
    '''
    :param train_fp: The training dataset file path
    :param dev_fp: The development dataset file path
    :param model_fp: The json file that describes the model
    :param vocab_data_fps: An optional list of additional dataset files that
                           will be used to create the model's vocab
    :returns: A tuple containing the trained model and an object that
              describes the model.
    '''
    set_random_env()
    model_params = Params.from_file(model_fp)
    emotion_dataset_reader = DatasetReader.from_params(
        model_params.pop('dataset_reader'))

    # Data
    train_dataset = emotion_dataset_reader.read(cached_path(str(train_fp)))
    dev_dataset = emotion_dataset_reader.read(cached_path(str(dev_fp)))
    vocab_datasets = [train_dataset, dev_dataset]
    if vocab_data_fps:
        for vocab_data_fp in vocab_data_fps:
            vocab_datasets.append(
                emotion_dataset_reader.read(cached_path(str(vocab_data_fp))))
    vocab_data = []
    for vocab_dataset in vocab_datasets:
        vocab_data.extend(vocab_dataset)
    vocab = Vocabulary.from_instances(vocab_data)
    emotion_model = Model.from_params(vocab=vocab,
                                      params=model_params.pop('model'))
    data_iter = DataIterator.from_params(model_params.pop('iterator'))
    data_iter.index_with(vocab)
    # Trainer
    with tempfile.TemporaryDirectory() as serial_dir:
        trainer_params = model_params.pop('trainer')
        trainer = Trainer.from_params(model=emotion_model,
                                      serialization_dir=serial_dir,
                                      iterator=data_iter,
                                      train_data=train_dataset,
                                      validation_data=dev_dataset,
                                      params=trainer_params)
        _ = trainer.train()

        temp_config_fp = str(Path(serial_dir, CONFIG_NAME).resolve())
        Params.from_file(model_fp).to_file(temp_config_fp)
        vocab.save_to_files(Path(serial_dir, "vocabulary").resolve())
        archive_model(serial_dir,
                      files_to_archive=model_params.files_to_archive)
        model_archive = load_archive(serial_dir, cuda_device=0)
        return model_archive.model, model_archive.config
Example #19
    def test_multi_iterator(self):
        params, file_paths = get_dataset_params_paths(['ner', 'ccg'])

        multitask_reader = DatasetReader.from_params(params)
        dataset = multitask_reader.read(file_paths)

        iterator_params = Params({
            "type": "multitask_iterator",
            "iterators": {
                "ner": {
                    "type": "bucket",
                    "sorting_keys": [["tokens", "num_tokens"]],
                    "padding_noise": 0.0,
                    "batch_size": 2
                },
                "ccg": {
                    "type": "basic",
                    "batch_size": 1
                }
            },
            "names_to_index": ["ner", "ccg"],
        })

        multi_iterator = DataIterator.from_params(iterator_params)

        # make the vocab
        vocab = Vocabulary.from_params(Params({}),
                                       (instance for instance in dataset))
        multi_iterator.index_with(vocab)

        all_batches = []
        for epoch in range(2):
            all_batches.append([])
            for batch in multi_iterator(dataset, shuffle=True, num_epochs=1):
                all_batches[-1].append(batch)

        # 3 batches per epoch -
        self.assertEqual([len(b) for b in all_batches], [3, 3])

        ner_batches = []
        ccg_batches = []
        for epoch_batches in all_batches:
            ner_batches.append(0)
            ccg_batches.append(0)
            for batch in epoch_batches:
                if 'original_pos_tags' not in batch:
                    ner_batches[-1] += 1
                if 'original_pos_tags' in batch:
                    ccg_batches[-1] += 1

        # 1 NER batch per epoch, 2 CCG per epoch
        self.assertEqual(ner_batches, [1, 1])
        self.assertEqual(ccg_batches, [2, 2])
Example #20
 def __init__(self, archive_path: str, device: int = -1, batch_size: int = 32):
     archive_path = Path(archive_path)
     archive = load_archive(archive_path)
     self.params = archive.config
     self.model = archive.model.eval()
     self.batch_size = batch_size
     self.reader = DatasetReader.from_params(self.params.get("dataset_reader"))
     self.vocab = self._load_vocab(archive_path)
     self.idx2label = self.vocab.get_index_to_token_vocabulary('labels')
     if device != -1:
         self.model.to(f"cuda:{device}")
     super(AllenNLPLimePredictor, self).__init__(self.idx2label)
Example #21
def run_config(config):
    params = Params(json.loads(config))
    params_copy = params.duplicate()

    if "dataset_reader" in params:
        reader = DatasetReader.from_params(params.pop("dataset_reader"))
    else:
        raise RuntimeError("`dataset_reader` section is required")

    loader_params = params.pop("iterator")
    train_data_loader = DataIterator.from_params(
        reader=reader,
        data_path=params.pop("train_data_path"),
        params=loader_params.duplicate(),
    )
    dev_data_loader = DataIterator.from_params(
        reader=reader,
        data_path=params.pop("validation_data_path"),
        params=loader_params,
    )

    print("Building the vocabulary...")
    vocab = Vocabulary.from_instances(train_data_loader.iter_instances())

    if "model" not in params:
        # 'dataset' mode — just preview the (first 10) instances
        print("Showing the first 10 instances:")
        for i, inst in enumerate(train_data_loader.iter_instances()):
            if i >= 10:
                break
            print(inst)
        return None

    model = Model.from_params(vocab=vocab, params=params.pop("model"))

    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    # set up a temporary, empty directory for serialization
    with tempfile.TemporaryDirectory() as serialization_dir:
        trainer = Trainer.from_params(
            model=model,
            serialization_dir=serialization_dir,
            data_loader=train_data_loader,
            validation_data_loader=dev_data_loader,
            params=params.pop("trainer"),
        )
        trainer.train()

    return {
        "params": params_copy,
        "dataset_reader": reader,
        "vocab": vocab,
        "model": model,
    }
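
For reference, a minimal configuration string that `run_config` could be handed might look like the sketch below; the registered type names and file paths are placeholders invented for illustration.

# Hypothetical config for run_config; component types and paths are placeholders.
example_config = """
{
  "dataset_reader": {"type": "my_reader"},
  "train_data_path": "data/train.json",
  "validation_data_path": "data/dev.json",
  "iterator": {"type": "basic", "batch_size": 32},
  "model": {"type": "my_model"},
  "trainer": {"num_epochs": 1, "optimizer": "adam"}
}
"""
# results = run_config(example_config)  # returns the params, reader, vocab, and model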
Example #22
 def from_dataset_reader(
     cls,
     reader: DatasetReader,
     data_path: str,
     batch_size: int,
     shuffle: bool = False,
     batches_per_epoch: Optional[int] = None,
 ) -> "SimpleDataLoader":
     instances = list(reader.read(data_path))
     return cls(instances,
                batch_size,
                shuffle=shuffle,
                batches_per_epoch=batches_per_epoch)
def main(train_path,
         val_path,
         test_path,
         config_path,
         subword_model_path,
         out_dir,
         max_text_subwords,
         max_summary_subwords,
         source_suffix,
         target_suffix,
         insert_tags=False,
         lowercase=False):
    params = Params.from_file(config_path)
    reader_params = params.pop("dataset_reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)

    processor = SentencePieceProcessor()
    processor.Load(subword_model_path)

    train_text_file = os.path.join(out_dir, "train.{}".format(source_suffix))
    train_summary_file = os.path.join(out_dir,
                                      "train.{}".format(target_suffix))
    val_text_file = os.path.join(out_dir, "val.{}".format(source_suffix))
    val_summary_file = os.path.join(out_dir, "val.{}".format(target_suffix))
    test_text_file = os.path.join(out_dir, "test.{}".format(source_suffix))
    test_summary_file = os.path.join(out_dir, "test.{}".format(target_suffix))

    files = ((train_path, train_text_file,
              train_summary_file), (val_path, val_text_file, val_summary_file),
             (test_path, test_text_file, test_summary_file))
    for path, text_file_name, summary_file_name in files:
        with open(text_file_name,
                  "w") as text_file, open(summary_file_name,
                                          "w") as summary_file:
            for text, summary in reader.parse_set(path):
                if lowercase:
                    text = text.lower()
                    summary = summary.lower()
                text_subwords = processor.EncodeAsPieces(text)
                if max_text_subwords:
                    text_subwords = text_subwords[:max_text_subwords]
                summary_subwords = processor.EncodeAsPieces(summary)
                if max_summary_subwords:
                    summary_subwords = summary_subwords[:max_summary_subwords]
                if insert_tags:
                    text_subwords.insert(0, "<t>")
                    text_subwords.append("</t>")
                    summary_subwords.insert(0, "<t>")
                    summary_subwords.append("</t>")
                text_file.write(" ".join(text_subwords) + "\n")
                summary_file.write((" ".join(summary_subwords)) + "\n")
Example #24
 def load_predictor(path):
     if path.endswith(".tar.gz"):
         return Predictor.from_path(path, "ja_seq2seq")
     elif path.endswith(".th"):
         serialization_dir = str(Path(path).parent)
         params = Params.from_file(str(serialization_dir + "/config.json"))
         model = Model.load(params,
                            str(serialization_dir),
                            weights_file=path)
         dataset_reader = DatasetReader.from_params(
             params.get("dataset_reader"))
         return JaSeq2SeqPredictor(model, dataset_reader)
     else:
         raise ValueError
Example #25
def setup_datasets(params: Params) -> Dict[str, Iterable[Instance]]:
    dataset_reader_params = params.get('dataset_reader')
    validation_dataset_reader_params = params.get('validation_dataset_reader', None)
    dataset_reader = DatasetReader.from_params(dataset_reader_params)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        validation_and_test_dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)

    train_data_path = params.get('train_data_path')
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    validation_data_path = params.get('validation_data_path', None)
    if validation_data_path is not None:
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.get("test_data_path", None)
    if test_data_path is not None:
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data
    return datasets
Example #26
 def setUp(self):
     super().setUp()
     param_file = self.FIXTURES_ROOT / "simple_tagger" / "experiment_with_regularization.json"
     self.set_up_model(param_file,
                       self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
     params = Params.from_file(param_file)
     self.reader = DatasetReader.from_params(params["dataset_reader"])
     self.data_loader = DataLoader.from_params(dataset=self.instances,
                                               params=params["data_loader"])
     self.trainer = Trainer.from_params(
         model=self.model,
         data_loader=self.data_loader,
         serialization_dir=self.TEST_DIR,
         params=params.get("trainer"),
     )
Example #27
    def from_config(cls,
                    config: Params,
                    predictor_name: str = None) -> 'Predictor':
        dataset_reader_params = config["dataset_reader"]
        dataset_reader = DatasetReader.from_params(dataset_reader_params)

        tokenizer = dataset_reader._tokenizer or WordTokenizer()  # pylint: disable=protected-access
        token_indexers = dataset_reader._token_indexers  # pylint: disable=protected-access

        model_name = config.get("model").get("type")
        model = Model.load(config)
        model.eval()

        predictor_name = predictor_name or DEFAULT_PREDICTORS[model_name]
        return Predictor.by_name(predictor_name)(model, tokenizer,
                                                 token_indexers)
Example #28
def main(args):
    config = Params.from_file(args.config_file)
    reader_config = config.pop("dataset_reader")
    reader = DatasetReader.from_params(reader_config)
    x = reader.read(args.input_file)
    if args.output_file is None:
        output_file = sys.stdout
    else:
        output_file = open(args.output_file, mode='w')
    for instance in x:
        tokens = instance.fields['tokens'].tokens
        tags = instance.fields['tags'].labels
        text = [t.text for t in tokens]
        for token, tag in zip(text, tags):
            print(f'{token}\t{tag}', file=output_file)
        print('', file=output_file)
def main(args):
    config = Params.from_file(args.config_file)
    reader_config = config.pop("dataset_reader")
    reader = DatasetReader.from_params(reader_config)
    x = reader.read(args.input_file)
    if args.output_file is None:
        outfile = sys.stdout
    else:
        outfile = open(args.output_file, mode='w')
    # print(len(x.instances))
    for instance in x:
        tokens = instance.fields['tokens'].tokens
        text = ' '.join([t.text for t in tokens])
        line = str(json.dumps({"sentence": text}))
        # print(tokens)
        print(line, file=outfile)
 def setUp(self):
     super().setUp()
     param_file = self.FIXTURES_ROOT / 'simple_tagger' / 'experiment_with_regularization.json'
     self.set_up_model(param_file,
                       self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
     params = Params.from_file(param_file)
     self.reader = DatasetReader.from_params(params['dataset_reader'])
     self.iterator = DataIterator.from_params(params['iterator'])
     self.trainer = Trainer.from_params(
             self.model,
             self.TEST_DIR,
             self.iterator,
             self.dataset,
             None,
             params.get('trainer')
     )
def train_subwords(train_path, model_path, model_type, vocab_size,
                   config_path):
    temp = tempfile.NamedTemporaryFile(mode="w", delete=False)
    params = Params.from_file(config_path)
    reader_params = params.pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    for text, summary in reader.parse_set(train_path):
        temp.write(text + "\n")
        temp.write(summary + "\n")
    temp.close()
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
        temp.name, os.path.join(model_path, model_type), vocab_size,
        model_type)
    sp_trainer.Train(cmd)
    os.unlink(temp.name)