Example #1
 def test_registry_has_builtin_dataset_readers(self):
     assert DatasetReader.by_name(u'snli').__name__ == u'SnliReader'
     assert DatasetReader.by_name(
         u'sequence_tagging').__name__ == u'SequenceTaggingDatasetReader'
     assert DatasetReader.by_name(
         u'language_modeling').__name__ == u'LanguageModelingReader'
     assert DatasetReader.by_name(u'squad').__name__ == u'SquadReader'
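These lookups work because every reader registers itself under a name via AllenNLP's Registrable mechanism. A minimal sketch of how a custom reader would become visible to DatasetReader.by_name (the reader name and class below are hypothetical, not built-ins):

from allennlp.data import DatasetReader

# "my_format" is an illustrative registry key, not a real built-in reader.
@DatasetReader.register("my_format")
class MyFormatReader(DatasetReader):
    def _read(self, file_path: str):
        # yield Instance objects parsed from file_path
        ...

assert DatasetReader.by_name("my_format").__name__ == "MyFormatReader"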
Example #2
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Load all the datasets specified by the config.
    """
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    validation_dataset_reader_params = params.pop("validation_dataset_reader", None)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info("Using a separate dataset reader to load validation and test data.")
        validation_and_test_dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets
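As a usage sketch, the function above can be driven with a hand-built Params object; the reader type and file paths below are placeholders:

from allennlp.common import Params

# Illustrative config: a built-in sequence-tagging reader and placeholder paths.
params = Params({
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "data/train.tsv",
    "validation_data_path": "data/dev.tsv",
})
datasets = datasets_from_params(params)
print(sorted(datasets))  # ['train', 'validation']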
Example #3
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger("allennlp.common.params").disabled = True
    logging.getLogger("allennlp.nn.initializers").disabled = True
    logging.getLogger("allennlp.modules.token_embedders.embedding").setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(
        args.archive_file,
        weights_file=args.weights_file,
        cuda_device=args.cuda_device,
        overrides=args.overrides,
    )
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop("validation_dataset_reader",
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop("dataset_reader"))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    embedding_sources = (json.loads(args.embedding_sources_mapping)
                         if args.embedding_sources_mapping else {})

    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    instances.index_with(model.vocab)
    data_loader_params = config.pop("validation_data_loader", None)
    if data_loader_params is None:
        data_loader_params = config.pop("data_loader")
    if args.batch_size:
        data_loader_params["batch_size"] = args.batch_size
    data_loader = DataLoader.from_params(dataset=instances,
                                         params=data_loader_params)

    metrics = evaluate(model, data_loader, args.cuda_device,
                       args.batch_weight_key)

    logger.info("Finished evaluating.")

    dump_metrics(args.output_file, metrics, log=True)

    return metrics
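For context, a sketch of the argparse namespace this function consumes; the flag names mirror the attributes accessed above and are assumptions, not the exact CLI definition:

import argparse

parser = argparse.ArgumentParser(description="Evaluate a trained model")
parser.add_argument("archive_file")
parser.add_argument("input_file")
parser.add_argument("--weights-file", default=None)
parser.add_argument("--cuda-device", type=int, default=-1)
parser.add_argument("--overrides", default="")
parser.add_argument("--output-file", default=None)
parser.add_argument("--batch-size", type=int, default=None)
parser.add_argument("--batch-weight-key", default="")
parser.add_argument("--embedding-sources-mapping", default="")
parser.add_argument("--extend-vocab", action="store_true")
args = parser.parse_args()  # argparse maps --weights-file to args.weights_file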
Example #5
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements

    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device)

    logger.info("Finished evaluating.")
    return metrics
Example #6
 def test_registry_has_builtin_dataset_readers(self):
     assert DatasetReader.by_name("snli").__name__ == "SnliReader"
     assert DatasetReader.by_name(
         "sequence_tagging").__name__ == "SequenceTaggingDatasetReader"
     assert DatasetReader.by_name(
         "language_modeling").__name__ == "LanguageModelingReader"
     assert DatasetReader.by_name("squad").__name__ == "SquadReader"
Example #7
def datasets_from_params(params):
    u"""
    Load all the datasets specified by the config.
    """
    dataset_reader = DatasetReader.from_params(params.pop(u'dataset_reader'))
    validation_dataset_reader_params = params.pop(u"validation_dataset_reader", None)

    validation_and_test_dataset_reader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info(u"Using a separate dataset reader to load validation and test data.")
        validation_and_test_dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)

    train_data_path = params.pop(u'train_data_path')
    logger.info(u"Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets = {u"train": train_data}

    validation_data_path = params.pop(u'validation_data_path', None)
    if validation_data_path is not None:
        logger.info(u"Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets[u"validation"] = validation_data

    test_data_path = params.pop(u"test_data_path", None)
    if test_data_path is not None:
        logger.info(u"Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets[u"test"] = test_data

    return datasets
Example #8
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides,
                           args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader',
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    embedding_sources: Dict[str,
                            str] = (json.loads(args.embedding_sources_mapping)
                                    if args.embedding_sources_mapping else {})
    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(Params({}), instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device,
                       args.batch_weight_key)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    output_file = args.output_file
    if output_file:
        with open(output_file, "w") as file:
            json.dump(metrics, file, indent=4)
    return metrics
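When --extend-vocab is used, embedding_sources_mapping is a JSON object mapping embedding modules inside the model to the pretrained files they were originally loaded from, so the newly added vocabulary rows can be initialized. A hedged illustration (the module path and file name are placeholders):

import json

embedding_sources_mapping = json.dumps({
    "_text_field_embedder.token_embedder_tokens": "glove.6B.100d.txt.gz"
})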
Example #9
 def test_registry_has_builtin_dataset_readers(self):
     assert DatasetReader.by_name('snli').__name__ == 'SnliReader'
     assert DatasetReader.by_name(
         'sequence_tagging').__name__ == 'SequenceTaggingDatasetReader'
     assert DatasetReader.by_name(
         'language_modeling').__name__ == 'LanguageModelingReader'
     assert DatasetReader.by_name(
         'squad_sentence_selection'
     ).__name__ == 'SquadSentenceSelectionReader'
Example #10
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Load model from archive
    model_archive = load_archive(args.model_archive_file, args.cuda_device,
                                 args.overrides, args.weights_file)
    config = model_archive.config
    prepare_environment(config)
    model = model_archive.model
    model.eval()

    # Load sampler
    sampler_archive = load_archive(args.sampler_archive_file, args.cuda_device,
                                   args.overrides, args.weights_file)
    sampler = sampler_archive.model
    sampler.eval()

    # Load the evaluation data. NOTE: We are using the model's reader!
    validation_dataset_reader_params = config.pop('validation_dataset_reader',
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info('Reading evaluation data from: %s', evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    # To avoid hairy issues with splitting, we opt to use a basic iterator so that we can
    # generate samples for entire sequences.
    iterator_params = config.pop('iterator', None)
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)
    # iterator.eval()
    metrics = evaluate_perplexity(model, sampler, args.num_samples, instances,
                                  iterator, args.cuda_device)

    logger.info('Finished evaluating.')
    logger.info('Metrics:')
    for key, metric in metrics.items():
        logger.info('%s: %s', key, metric)

    output_file = args.output_file
    if output_file:
        with open(output_file, 'w') as f:
            json.dump(metrics, f, indent=4)
    return metrics
Example #11
 def __init__(
     self,
     token_indexers: Dict[str, TokenIndexer] = None,
     domain_identifier: str = None,
     model_name: str = None,
     **kwargs,
 ) -> None:
     DatasetReader.__init__(self, **kwargs)
     self._token_indexers = token_indexers or {
         "tokens": PretrainedTransformerIndexer(model_name)
     }
     self._domain_identifier = domain_identifier
     self.tokenizer = AutoTokenizer.from_pretrained(model_name)
     self.lowercase_input = "uncased" in model_name
Example #12
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Load all the datasets specified by the config.
    """
    fully_labelled_threshold = params['dataset_reader'].get(
        'fully_labelled_threshold', 3000)
    dataset_reader = DatasetReader.from_params(
        params.pop("dataset_reader", None))
    validation_dataset_reader_params = params.pop("validation_dataset_reader",
                                                  None)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info(
            "Using a separate dataset reader to load validation and test data."
        )
        validation_and_test_dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)

    # Split train data into held out/not held out, initializing to 10% non-held-out
    # non-held-out training data will have 100% of labels (using dataset_reader)
    # held-out training data will have only 50% of labels (using held_out_dataset_reader)
    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    held_out_train_data = train_data[
        fully_labelled_threshold:]  # after threshold
    train_data = train_data[:fully_labelled_threshold]  # before threshold

    datasets: Dict[str, Iterable[Instance]] = {
        "train": train_data,
        "held_out_train": held_out_train_data
    }

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(
            validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets
Example #13
def evaluate_from_args(args):
    # Disable some of the more verbose logging statements
    logging.getLogger(u'allennlp.common.params').disabled = True
    logging.getLogger(u'allennlp.nn.initializers').disabled = True
    logging.getLogger(u'allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides,
                           args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop(u'validation_dataset_reader',
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop(u'dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info(u"Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator_params = config.pop(u"validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop(u"iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device)

    logger.info(u"Finished evaluating.")
    logger.info(u"Metrics:")
    for key, metric in list(metrics.items()):
        logger.info(u"%s: %s", key, metric)

    output_file = args.output_file
    if output_file:
        with open(output_file, u"w") as file:
            json.dump(metrics, file, indent=4)
    return metrics
Example #14
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides,
                           args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Set the model to error analysis mode
    model.error_analysis = True

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader',
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop('dataset_reader'))
    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator = DataIterator.from_params(config.pop("iterator"))
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device)

    logger.info("Finished evaluating.")
    print("All Metrics")
    print("=" * 79)
    for key, metric in metrics.items():
        print("{}\t{}".format(key, metric))

    # Turn off error analysis mode
    model.error_analysis = False
    return metrics
Example #15
    def _evaluate_nn(self, model_path: str, evaluation_data_file: str,
                     cuda_device: int):
        """

        :param model_path:
        :param evaluation_data_file:
        :param cuda_device:
        :return:
        """
        # import allennlp ontoemma classes (to register -- necessary, do not remove)
        from emma.allennlp_classes.ontoemma_dataset_reader import OntologyMatchingDatasetReader
        from emma.allennlp_classes.ontoemma_model import OntoEmmaNN

        # Load from archive
        archive = load_archive(model_path, cuda_device)
        config = archive.config
        prepare_environment(config)
        model = archive.model
        model.eval()

        # Load the evaluation data
        dataset_reader = DatasetReader.from_params(
            config.pop('dataset_reader'))
        evaluation_data_path = evaluation_data_file
        dataset = dataset_reader.read(evaluation_data_path)

        # compute metrics
        dataset.index_instances(model.vocab)
        iterator = DataIterator.from_params(config.pop("iterator"))
        metrics = evaluate_allennlp(model, dataset, iterator, cuda_device)

        return metrics
Example #16
    def __init__(self,
                 target_namespace: str,
                 span_predictor_model,
                 source_tokenizer: Tokenizer = None,
                 target_tokenizer: Tokenizer = None,
                 source_token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False,
                 add_rule: bool = True,
                 embed_span: bool = True,
                 add_question: bool = True,
                 add_followup_ques: bool = True,
                 train_using_gold: bool = True) -> None:
        super().__init__(lazy)
        self._target_namespace = target_namespace
        self._source_tokenizer = source_tokenizer or WordTokenizer()
        self._target_tokenizer = target_tokenizer or self._source_tokenizer
        self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.add_rule = add_rule
        self.embed_span = embed_span
        self.add_question = add_question
        self.add_followup_ques = add_followup_ques
        self.train_using_gold = train_using_gold
        if "tokens" not in self._source_token_indexers or \
                not isinstance(self._source_token_indexers["tokens"], SingleIdTokenIndexer):
            raise ConfigurationError("CopyNetDatasetReader expects 'source_token_indexers' to contain "
                                     "a 'single_id' token indexer called 'tokens'.")
        self._target_token_indexers: Dict[str, TokenIndexer] = {
                "tokens": SingleIdTokenIndexer(namespace=self._target_namespace)
        }

        archive = load_archive(span_predictor_model)
        self.dataset_reader = DatasetReader.from_params(archive.config.duplicate()["dataset_reader"])
        self.span_predictor = Predictor.from_archive(archive, 'sharc_predictor')
Example #17
def load_predictor(serialization_dir, device):
	## Load the model
	archive = load_archive(join(serialization_dir, 'model.tar.gz'))
	model = archive.model.eval()
	if device >= 0:
		model.to(device)

	## Load the dataset reader
	dataset_reader_params = archive.config.pop('dataset_reader')
	model_name = archive.config.pop('model')['type']

	# Turn off truncation of the inputs
	if model_name == 'gnli':
		pass
		# dataset_reader_params.params['max_premise_length'] = None
		# dataset_reader_params.params['max_hypothesis_length'] = None
	elif model_name == 'bertnli':
		dataset_reader_params.params['max_seq_length'] = None
	else:
		raise ValueError(f"Unknown model name: {model_name}")

	reader = DatasetReader.by_name(dataset_reader_params.pop('type')).from_params(dataset_reader_params)

	predictor = Predictor(model, reader)
	return predictor
Example #18
def run(model_path, test_path, config_path, output_path, batch_size):
    params_path = config_path or os.path.join(model_path, "config.json")

    params = Params.from_file(params_path)
    is_subwords = "tokenizer" in params["reader"] and params["reader"][
        "tokenizer"]["type"] == "subword"
    reader = DatasetReader.from_params(params.pop("reader"))

    device = 0 if torch.cuda.is_available() else -1
    model = Model.load(params, model_path, cuda_device=device)
    model.eval()  # put the model in evaluation mode (propagates to submodules)

    predictor = Seq2SeqPredictor(model, reader)
    with open(output_path, "wt", encoding="utf-8") as w:
        for batch_number, batch in enumerate(get_batches(
                test_path, batch_size)):
            outputs = predictor.predict_batch_json(batch)
            assert len(outputs) == len(batch)
            for output in outputs:
                decoded_words = output["predicted_tokens"]
                if not decoded_words:
                    decoded_words = ["заявил"]
                if not is_subwords:
                    hyp = " ".join(decoded_words)
                else:
                    hyp = "".join(decoded_words).replace("▁", " ").replace(
                        "\n", "").strip()
                if len(hyp) <= 3:
                    hyp = "заявил"
                w.write(hyp + "\n")
Example #19
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load parameter file
    with open(args.config_file) as config_file:
        config = Params(replace_none(json.loads(config_file.read())))

    model = Model.load(config,
                       weights_file=args.weights_file,
                       cuda_device=args.cuda_device)
    model.eval()

    vocab = model._vocab  # pylint: disable=protected-access

    # Load the evaluation data
    dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    dataset = dataset_reader.read(evaluation_data_path)
    dataset.index_instances(vocab)

    iterator = DataIterator.from_params(config.pop("iterator"))

    metrics = evaluate(model, dataset, iterator, args.cuda_device)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics
Example #20
def train(model_path, train_path, val_path, seed, vocabulary_path=None, config_path=None):
    assert os.path.isdir(model_path), "Model directory does not exist"
    set_seed(seed)

    config_path = config_path or os.path.join(model_path, "config.json")
    assert os.path.isfile(config_path), "Config file does not exist"
    params = Params.from_file(config_path)

    vocabulary_path = vocabulary_path or os.path.join(model_path, "vocabulary")
    assert os.path.exists(vocabulary_path), "Vocabulary is not ready, do not forget to run preprocess.py first"
    vocabulary = Vocabulary.from_files(vocabulary_path)

    reader_params = params.duplicate().pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    train_dataset = reader.read(train_path)
    val_dataset = reader.read(val_path) if val_path else None

    model_params = params.pop("model")
    model = Model.from_params(model_params, vocab=vocabulary)
    print(model)
    print("Trainable params count: ", sum(p.numel() for p in model.parameters() if p.requires_grad))

    iterator = DataIterator.from_params(params.pop('iterator'))
    iterator.index_with(vocabulary)
    trainer = Trainer.from_params(model, model_path, iterator,
                                  train_dataset, val_dataset, params.pop('trainer'))
    trainer.train()
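For reference, a minimal config.json compatible with the pops above might look like this sketch; every type name and hyperparameter is a placeholder:

config = {
    "reader": {"type": "seq2seq"},
    "model": {"type": "simple_seq2seq"},
    "iterator": {"type": "basic", "batch_size": 32},
    "trainer": {"num_epochs": 10, "optimizer": "adam"},
}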
Example #21
def datasets_from_params(params: Params) -> Dict[str, InstanceCollection]:
    """
    Load all the datasets specified by the config.
    """
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, InstanceCollection] = {"train": train_data}

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets
Example #22
 def test_can_read_partial_instances(self, partial_data_path, params: Params) -> None:
     reader = DatasetReader.from_params(params)
     instances = reader.read_partial(partial_data_path)
     vocab = Vocabulary.from_instances(instances)
     assert "partial_labels" in vocab._token_to_index
     assert set(vocab.get_token_to_index_vocabulary("partial_labels").keys()) == \
         set(["I-<UNK>", "O", "I-type", "I-attr", "I-location"])
Example #23
 def _load_reader_and_predictor(basedir):
     config_path = os.path.join(basedir, 'config.json')
     config = Params.from_file(config_path)
     model = Model.load(config=config, serialization_dir=basedir)
     reader = DatasetReader.from_params(config.get('dataset_reader'))
     predictor = Predictor(model=model, dataset_reader=reader)
     return reader, predictor
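Typical usage might look like the following sketch; the run directory and the text_to_instance arguments depend on the concrete reader and are illustrative:

reader, predictor = _load_reader_and_predictor("runs/my_model")
instance = reader.text_to_instance("An example input sentence.")
output = predictor.predict_instance(instance)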
Example #24
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data
    dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    dataset = dataset_reader.read(evaluation_data_path)
    dataset.index_instances(model.vocab)

    iterator = DataIterator.from_params(config.pop("iterator"))

    metrics = evaluate(model, dataset, iterator, args.cuda_device)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics
Example #25
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Import any additional modules needed (to register custom classes)
    for package_name in args.include_package:
        import_submodules(package_name)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides,
                           args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader',
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop('dataset_reader'))
    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator = DataIterator.from_params(config.pop("iterator"))
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics
Example #26
def evaluate(test_path,
             batch_size,
             metric,
             max_count,
             report_every,
             is_multiple_ref=False,
             model_path=None,
             model_config_path=None,
             baseline=None,
             reader_config_path=None,
             detokenize_after=False):
    reader_params = get_reader_params(reader_config_path, model_config_path,
                                      model_path)
    is_subwords = "tokenizer" in reader_params and reader_params["tokenizer"][
        "type"] == "subword"
    reader = DatasetReader.from_params(reader_params)
    run_model = get_model_runner(model_path, reader) if not baseline else None

    hyps = []
    refs = []
    for batch in get_batches(reader, test_path, batch_size):
        batch_refs, batch_hyps = run_model(
            batch) if not baseline else run_baseline(batch, baseline)
        for ref, hyp in zip(batch_refs, batch_hyps):
            hyp = hyp if not is_subwords else "".join(hyp.split(" ")).replace(
                "▁", " ")
            if is_multiple_ref:
                reference_sents = ref.split(" s_s ")
                decoded_sents = hyp.split("s_s")
                hyp = [
                    w.replace("<", "&lt;").replace(">", "&gt;").strip()
                    for w in decoded_sents
                ]
                ref = [
                    w.replace("<", "&lt;").replace(">", "&gt;").strip()
                    for w in reference_sents
                ]
                hyp = " ".join(hyp)
                ref = " ".join(ref)
            ref = ref.strip()
            hyp = hyp.strip()
            if detokenize_after:
                hyp = detokenize(hyp)
                ref = detokenize(ref)
            if isinstance(ref, str) and len(ref) <= 1:
                ref = "some content"
                print("Empty ref")
            if isinstance(hyp, str) and len(hyp) <= 1:
                hyp = "some content"
                print("Empty hyp")

            refs.append(ref)
            hyps.append(hyp)
            if len(hyps) % report_every == 0:
                calc_metrics(refs, hyps, metric)
            if max_count and len(hyps) >= max_count:
                break
        if max_count and len(hyps) >= max_count:
            # also leave the outer batch loop once the cap is reached
            break
    calc_metrics(refs, hyps, metric)
Example #27
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Load all the datasets specified by the config.
    """
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    validation_dataset_reader_params = params.pop("validation_dataset_reader",
                                                  None)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info(
            "Using a separate dataset reader to load validation and test data."
        )
        validation_and_test_dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)

    # train_data_path = params.pop('train_data_path')
    # logger.info("Reading training data from %s", train_data_path)
    # train_data = dataset_reader.read(train_data_path)

    # datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    datasets: Dict[str, Iterable[Instance]] = {}

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(
            validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = validation_data_path.replace(
        'development', 'test') if validation_data_path else None
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    #other_data_path = params.pop("other_data_path",None)
    #if other_data_path is not None:
    #    logger.info("Reading other data from %s", other_data_path)
    #    other_data_path = validation_and_test_dataset_reader.read(other_data_path)
    #    datasets["other"] = other_data_path

    return datasets
Example #28
 def test_sentence_markers(self, data_path: str, sentence_marker_params: Params) -> None:
     reader = DatasetReader.from_params(sentence_marker_params)
     instances = reader.read(data_path)
     # vocab = Vocabulary.from_instances(instances)
     for instance in instances:
         tokens = instance["tokens"]
         sentence_markers = instance["metadata"]["sentence_markers"]
         sentences = get_sentences_from_markers(tokens, sentence_markers)
         assert sum(len(x) for x in sentences) == len(tokens) == sentence_markers[-1]
Example #29
def target_to_lines(archive_file, input_file, output_file, lowercase=True):
    archive = load_archive(archive_file)
    reader = DatasetReader.from_params(archive.config.pop("dataset_reader"))
    with open(output_file, "w") as w:
        for t in reader.parse_set(input_file):
            target = t[1]
            target = target.strip()
            target = target.lower() if lowercase else target
            w.write(target.replace("\n", " ") + "\n")
Example #30
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device, args.batch_weight_key)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    output_file = args.output_file
    if output_file:
        with open(output_file, "w") as file:
            json.dump(metrics, file, indent=4)
    return metrics
Example #31
def main(config: str, model_th: str, dataset: str, hypo_file: str, ref_file: str,
         batch_size: int, no_gpu: bool):
    logger = logging.getLogger(__name__)

    logger.info("Loading configuration parameters")
    params = Params.from_file(config)

    vocab_params = params.pop("vocabulary")
    vocab = Vocabulary.from_params(vocab_params)

    reader_params = params.pop("dataset_reader")
    reader_name = reader_params.pop("type")
    # reader_params["lazy"] = True  # make sure we do not load the entire dataset

    reader = DatasetReader.by_name(reader_name).from_params(reader_params)

    logger.info("Reading data from {}".format(dataset))
    data = reader.read(dataset)

    iterator = BasicIterator(batch_size=batch_size)
    iterator.index_with(vocab)
    batches = iterator._create_batches(data, shuffle=False)

    logger.info("Loading model")
    model_params = params.pop("model")
    model_name = model_params.pop("type")
    model = Model.by_name(model_name).from_params(model_params, vocab=vocab)
    if not no_gpu:
        model.cuda(0)

    with open(model_th, 'rb') as f:
        if no_gpu:
            state_dict = torch.load(f, map_location=torch.device('cpu'))
        else:
            state_dict = torch.load(f)

    model.load_state_dict(state_dict)

    predictor = Seq2SeqPredictor(model, reader)
    model.eval()

    with open(hypo_file, 'w') as hf, open(ref_file, 'w') as rf:
        logger.info("Generating predictions")
        for sample in tqdm(batches):
            s = list(sample)
            pred = predictor.predict_batch_instance(s)

            for inst, p in zip(s, pred):
                print(
                    " ".join(p["predicted_tokens"][0]),
                    file=hf
                )
                print(
                    " ".join(t.text for t in inst["target_tokens"][1:-1]),
                    file=rf
                )
Example #32
def preprocess(train_path, vocabulary_path, config_path):
    params = Params.from_file(config_path)

    reader_params = params.pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    dataset = reader.read(train_path)

    vocabulary_params = params.pop("vocabulary", default=Params({}))
    vocabulary = Vocabulary.from_params(vocabulary_params, instances=dataset)
    vocabulary.save_to_files(vocabulary_path)
Example #33
def _worker(reader: DatasetReader,
            input_queue: Queue,
            output_queue: Queue,
            index: int) -> None:
    """
    A worker that pulls filenames off the input queue, uses the dataset reader
    to read them, and places the generated instances on the output queue.
    When there are no filenames left on the input queue, it puts its ``index``
    on the output queue and doesn't do anything else.
    """
    # Keep going until you get a file_path that's None.
    while True:
        file_path = input_queue.get()
        if file_path is None:
            # Put my index on the queue to signify that I'm finished
            output_queue.put(index)
            break

        logger.info(f"reading instances from {file_path}")
        for instance in reader.read(file_path):
            output_queue.put(instance)
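A sketch of how such workers could be driven; the queue wiring and worker count below are illustrative rather than the library's actual multiprocess reader:

from multiprocessing import Process, Queue

def read_in_parallel(reader, file_paths, num_workers=2):
    input_queue, output_queue = Queue(), Queue()
    for path in file_paths:
        input_queue.put(path)
    for _ in range(num_workers):
        input_queue.put(None)  # one stop sentinel per worker
    workers = [Process(target=_worker, args=(reader, input_queue, output_queue, i))
               for i in range(num_workers)]
    for worker in workers:
        worker.start()
    finished = 0
    while finished < num_workers:
        item = output_queue.get()
        if isinstance(item, int):  # a worker reported its index: it is done
            finished += 1
        else:
            yield item  # an Instance produced by one of the workers
    for worker in workers:
        worker.join()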
Example #34
 def test_registry_has_builtin_dataset_readers(self):
     assert DatasetReader.by_name('snli').__name__ == 'SnliReader'
     assert DatasetReader.by_name('sequence_tagging').__name__ == 'SequenceTaggingDatasetReader'
     assert DatasetReader.by_name('language_modeling').__name__ == 'LanguageModelingReader'
     assert DatasetReader.by_name('squad').__name__ == 'SquadReader'