Example #1
def datasets_from_params(params):
    u"""
    Load all the datasets specified by the config.
    """
    dataset_reader = DatasetReader.from_params(params.pop(u'dataset_reader'))
    validation_dataset_reader_params = params.pop(u"validation_dataset_reader", None)

    validation_and_test_dataset_reader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info(u"Using a separate dataset reader to load validation and test data.")
        validation_and_test_dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)

    train_data_path = params.pop(u'train_data_path')
    logger.info(u"Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets = {u"train": train_data}

    validation_data_path = params.pop(u'validation_data_path', None)
    if validation_data_path is not None:
        logger.info(u"Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets[u"validation"] = validation_data

    test_data_path = params.pop(u"test_data_path", None)
    if test_data_path is not None:
        logger.info(u"Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets[u"test"] = test_data

    return datasets
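A minimal usage sketch for the function above. The reader type and file paths are placeholder assumptions, not part of the original example; "snli" stands in for any registered DatasetReader:

from allennlp.common import Params

# Hypothetical config; any registered reader type and real paths would do.
params = Params({
    "dataset_reader": {"type": "snli"},
    "train_data_path": "data/train.jsonl",
    "validation_data_path": "data/dev.jsonl",  # optional; omit to skip
})
datasets = datasets_from_params(params)
print(list(datasets.keys()))  # e.g. ['train', 'validation']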
Example #2
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Load all the datasets specified by the config.
    """
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    validation_dataset_reader_params = params.pop("validation_dataset_reader", None)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info("Using a separate dataset reader to load validation and test data.")
        validation_and_test_dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets
Example #4
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements

    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)
Example #5
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger("allennlp.common.params").disabled = True
    logging.getLogger("allennlp.nn.initializers").disabled = True
    logging.getLogger("allennlp.modules.token_embedders.embedding").setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(
        args.archive_file,
        weights_file=args.weights_file,
        cuda_device=args.cuda_device,
        overrides=args.overrides,
    )
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop("validation_dataset_reader",
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop("dataset_reader"))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    embedding_sources = (json.loads(args.embedding_sources_mapping)
                         if args.embedding_sources_mapping else {})

    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    instances.index_with(model.vocab)
    data_loader_params = config.pop("validation_data_loader", None)
    if data_loader_params is None:
        data_loader_params = config.pop("data_loader")
    if args.batch_size:
        data_loader_params["batch_size"] = args.batch_size
    data_loader = DataLoader.from_params(dataset=instances,
                                         params=data_loader_params)

    metrics = evaluate(model, data_loader, args.cuda_device,
                       args.batch_weight_key)

    logger.info("Finished evaluating.")

    dump_metrics(args.output_file, metrics, log=True)

    return metrics
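For reference, a sketch of the argparse namespace this entry point expects. The field names are read off the attribute accesses above; every value is a placeholder:

import argparse

# Placeholder values mirroring the attributes evaluate_from_args uses.
args = argparse.Namespace(
    archive_file="model.tar.gz",
    weights_file=None,
    cuda_device=-1,                # -1 runs on CPU
    overrides="",
    input_file="data/test.jsonl",
    embedding_sources_mapping="",  # JSON string, or empty to skip
    extend_vocab=False,
    batch_size=None,
    batch_weight_key="",
    output_file="metrics.json",
)
# metrics = evaluate_from_args(args)  # requires a real archive on disk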
Example #6
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides,
                           args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader',
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    embedding_sources: Dict[str,
                            str] = (json.loads(args.embedding_sources_mapping)
                                    if args.embedding_sources_mapping else {})
    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(Params({}), instances=instances)
        model.extend_embedder_vocab(embedding_sources)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device,
                       args.batch_weight_key)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    output_file = args.output_file
    if output_file:
        with open(output_file, "w") as file:
            json.dump(metrics, file, indent=4)
    return metrics
Example #7
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Load model from archive
    model_archive = load_archive(args.model_archive_file, args.cuda_device,
                                 args.overrides, args.weights_file)
    config = model_archive.config
    prepare_environment(config)
    model = model_archive.model
    model.eval()

    # Load sampler
    sampler_archive = load_archive(args.sampler_archive_file, args.cuda_device,
                                   args.overrides, args.weights_file)
    sampler = sampler_archive.model
    sampler.eval()

    # Load the evaluation data. NOTE: We are using the model's reader!
    validation_dataset_reader_params = config.pop('validation_dataset_reader',
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info('Reading evaluation data from: %s', evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    # To avoid hairy issues with splitting, we opt to use a basic iterator so that we can
    # generate samples for entire sequences.
    iterator_params = config.pop('iterator', None)
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)
    # iterator.eval()
    metrics = evaluate_perplexity(model, sampler, args.num_samples, instances,
                                  iterator, args.cuda_device)

    logger.info('Finished evaluating.')
    logger.info('Metrics:')
    for key, metric in metrics.items():
        logger.info('%s: %s', key, metric)

    output_file = args.output_file
    if output_file:
        with open(output_file, 'w') as f:
            json.dump(metrics, f, indent=4)
    return metrics
Example #8
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Load all the datasets specified by the config.
    """
    fully_labelled_threshold = params['dataset_reader'].get(
        'fully_labelled_threshold', 3000)
    dataset_reader = DatasetReader.from_params(
        params.pop("dataset_reader", None))
    validation_dataset_reader_params = params.pop("validation_dataset_reader",
                                                  None)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info(
            "Using a separate dataset reader to load validation and test data."
        )
        validation_and_test_dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)

    # Split train data into held out/not held out, initializing to 10% non-held-out
    # non-held-out training data will have 100% of labels (using dataset_reader)
    # held-out training data will have only 50% of labels (using held_out_dataset_reader)
    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    held_out_train_data = train_data[
        fully_labelled_threshold:]  # after threshold
    train_data = train_data[:fully_labelled_threshold]  # before threshold

    datasets: Dict[str, Iterable[Instance]] = {
        "train": train_data,
        "held_out_train": held_out_train_data
    }

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(
            validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets
Example #9
def evaluate_from_args(args):
    # Disable some of the more verbose logging statements
    logging.getLogger(u'allennlp.common.params').disabled = True
    logging.getLogger(u'allennlp.nn.initializers').disabled = True
    logging.getLogger(u'allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides,
                           args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop(u'validation_dataset_reader',
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop(u'dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info(u"Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator_params = config.pop(u"validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop(u"iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device)

    logger.info(u"Finished evaluating.")
    logger.info(u"Metrics:")
    for key, metric in list(metrics.items()):
        logger.info(u"%s: %s", key, metric)

    output_file = args.output_file
    if output_file:
        with open(output_file, u"w") as file:
            json.dump(metrics, file, indent=4)
    return metrics
Example #10
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides,
                           args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Set the model to error analysis mode
    model.error_analysis = True

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader',
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop('dataset_reader'))
    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator = DataIterator.from_params(config.pop("iterator"))
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device)

    logger.info("Finished evaluating.")
    print("All Metrics")
    print("=" * 79)
    for key, metric in metrics.items():
        print("{}\t{}".format(key, metric))

    # Turn off error analysis mode
    model.error_analysis = False
    return metrics
Example #11
    def test_can_read_partial_instances(self, partial_data_path, params: Params) -> None:
        reader = DatasetReader.from_params(params)
        instances = reader.read_partial(partial_data_path)
        vocab = Vocabulary.from_instances(instances)
        assert "partial_labels" in vocab._token_to_index
        assert set(vocab.get_token_to_index_vocabulary("partial_labels").keys()) == \
            set(["I-<UNK>", "O", "I-type", "I-attr", "I-location"])
Example #12
def datasets_from_params(params: Params) -> Dict[str, InstanceCollection]:
    """
    Load all the datasets specified by the config.
    """
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    datasets: Dict[str, InstanceCollection] = {"train": train_data}

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = params.pop("test_data_path", None)
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    return datasets
Example #13
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load parameter file
    with open(args.config_file) as config_file:
        config = Params(replace_none(json.loads(config_file.read())))

    model = Model.load(config,
                       weights_file=args.weights_file,
                       cuda_device=args.cuda_device)
    model.eval()

    vocab = model._vocab  # pylint: disable=protected-access

    # Load the evaluation data
    dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    dataset = dataset_reader.read(evaluation_data_path)
    dataset.index_instances(vocab)

    iterator = DataIterator.from_params(config.pop("iterator"))

    metrics = evaluate(model, dataset, iterator, args.cuda_device)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics
Example #14
def run(model_path, test_path, config_path, output_path, batch_size):
    params_path = config_path or os.path.join(model_path, "config.json")

    params = Params.from_file(params_path)
    is_subwords = "tokenizer" in params["reader"] and params["reader"][
        "tokenizer"]["type"] == "subword"
    reader = DatasetReader.from_params(params.pop("reader"))

    device = 0 if torch.cuda.is_available() else -1
    model = Model.load(params, model_path, cuda_device=device)
    model.eval()  # use eval() so dropout/batch norm are disabled in all submodules

    predictor = Seq2SeqPredictor(model, reader)
    with open(output_path, "wt", encoding="utf-8") as w:
        for batch_number, batch in enumerate(get_batches(
                test_path, batch_size)):
            outputs = predictor.predict_batch_json(batch)
            assert len(outputs) == len(batch)
            for output in outputs:
                decoded_words = output["predicted_tokens"]
                if not decoded_words:
                    decoded_words = ["заявил"]
                if not is_subwords:
                    hyp = " ".join(decoded_words)
                else:
                    hyp = "".join(decoded_words).replace("▁", " ").replace(
                        "\n", "").strip()
                if len(hyp) <= 3:
                    hyp = "заявил"
                w.write(hyp + "\n")
Example #15
    def __init__(self,
                 target_namespace: str,
                 span_predictor_model,
                 source_tokenizer: Tokenizer = None,
                 target_tokenizer: Tokenizer = None,
                 source_token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False,
                 add_rule: bool = True,
                 embed_span: bool = True,
                 add_question: bool = True,
                 add_followup_ques: bool = True,
                 train_using_gold: bool = True) -> None:
        super().__init__(lazy)
        self._target_namespace = target_namespace
        self._source_tokenizer = source_tokenizer or WordTokenizer()
        self._target_tokenizer = target_tokenizer or self._source_tokenizer
        self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.add_rule = add_rule
        self.embed_span = embed_span
        self.add_question = add_question
        self.add_followup_ques = add_followup_ques
        self.train_using_gold = train_using_gold
        if "tokens" not in self._source_token_indexers or \
                not isinstance(self._source_token_indexers["tokens"], SingleIdTokenIndexer):
            raise ConfigurationError("CopyNetDatasetReader expects 'source_token_indexers' to contain "
                                     "a 'single_id' token indexer called 'tokens'.")
        self._target_token_indexers: Dict[str, TokenIndexer] = {
                "tokens": SingleIdTokenIndexer(namespace=self._target_namespace)
        }

        archive = load_archive(span_predictor_model)
        self.dataset_reader = DatasetReader.from_params(archive.config.duplicate()["dataset_reader"])
        self.span_predictor = Predictor.from_archive(archive, 'sharc_predictor')
Example #16
def train(model_path, train_path, val_path, seed, vocabulary_path=None, config_path=None):
    assert os.path.isdir(model_path), "Model directory does not exist"
    set_seed(seed)

    config_path = config_path or os.path.join(model_path, "config.json")
    assert os.path.isfile(config_path), "Config file does not exist"
    params = Params.from_file(config_path)

    vocabulary_path = vocabulary_path or os.path.join(model_path, "vocabulary")
    assert os.path.exists(vocabulary_path), "Vocabulary is not ready, do not forget to run preprocess.py first"
    vocabulary = Vocabulary.from_files(vocabulary_path)

    reader_params = params.duplicate().pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    train_dataset = reader.read(train_path)
    val_dataset = reader.read(val_path) if val_path else None

    model_params = params.pop("model")
    model = Model.from_params(model_params, vocab=vocabulary)
    print(model)
    print("Trainable params count: ", sum(p.numel() for p in model.parameters() if p.requires_grad))

    iterator = DataIterator.from_params(params.pop('iterator'))
    iterator.index_with(vocabulary)
    trainer = Trainer.from_params(model, model_path, iterator,
                                  train_dataset, val_dataset, params.pop('trainer'))
    trainer.train()
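A hedged sketch of calling this trainer, assuming model_path already contains config.json and a vocabulary built by a preprocess step (all paths are placeholders):

# Hypothetical invocation; directories and files must already exist.
train(
    model_path="model/",
    train_path="data/train.jsonl",
    val_path="data/val.jsonl",
    seed=42,
)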
Example #17
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data
    dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    dataset = dataset_reader.read(evaluation_data_path)
    dataset.index_instances(model.vocab)

    iterator = DataIterator.from_params(config.pop("iterator"))

    metrics = evaluate(model, dataset, iterator, args.cuda_device)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics
Example #18
    def _evaluate_nn(self, model_path: str, evaluation_data_file: str,
                     cuda_device: int):
        """

        :param model_path:
        :param evaluation_data_file:
        :param cuda_device:
        :return:
        """
        # import allennlp ontoemma classes (to register -- necessary, do not remove)
        from emma.allennlp_classes.ontoemma_dataset_reader import OntologyMatchingDatasetReader
        from emma.allennlp_classes.ontoemma_model import OntoEmmaNN

        # Load from archive
        archive = load_archive(model_path, cuda_device)
        config = archive.config
        prepare_environment(config)
        model = archive.model
        model.eval()

        # Load the evaluation data
        dataset_reader = DatasetReader.from_params(
            config.pop('dataset_reader'))
        evaluation_data_path = evaluation_data_file
        dataset = dataset_reader.read(evaluation_data_path)

        # compute metrics
        dataset.index_instances(model.vocab)
        iterator = DataIterator.from_params(config.pop("iterator"))
        metrics = evaluate_allennlp(model, dataset, iterator, cuda_device)

        return metrics
Example #19
File: allen.py, Project: phillswope/charade
def _load_reader_and_predictor(basedir):
    config_path = os.path.join(basedir, 'config.json')
    config = Params.from_file(config_path)
    model = Model.load(config=config, serialization_dir=basedir)
    reader = DatasetReader.from_params(config.get('dataset_reader'))
    predictor = Predictor(model=model, dataset_reader=reader)
    return reader, predictor
Example #20
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Import any additional modules needed (to register custom classes)
    for package_name in args.include_package:
        import_submodules(package_name)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides,
                           args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader',
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop('dataset_reader'))
    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator = DataIterator.from_params(config.pop("iterator"))
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics
Example #21
def evaluate(test_path,
             batch_size,
             metric,
             max_count,
             report_every,
             is_multiple_ref=False,
             model_path=None,
             model_config_path=None,
             baseline=None,
             reader_config_path=None,
             detokenize_after=False):
    reader_params = get_reader_params(reader_config_path, model_config_path,
                                      model_path)
    is_subwords = "tokenizer" in reader_params and reader_params["tokenizer"][
        "type"] == "subword"
    reader = DatasetReader.from_params(reader_params)
    run_model = get_model_runner(model_path, reader) if not baseline else None

    hyps = []
    refs = []
    for batch in get_batches(reader, test_path, batch_size):
        batch_refs, batch_hyps = run_model(
            batch) if not baseline else run_baseline(batch, baseline)
        for ref, hyp in zip(batch_refs, batch_hyps):
            hyp = hyp if not is_subwords else "".join(hyp.split(" ")).replace(
                "▁", " ")
            if is_multiple_ref:
                reference_sents = ref.split(" s_s ")
                decoded_sents = hyp.split("s_s")
                hyp = [
                    w.replace("<", "&lt;").replace(">", "&gt;").strip()
                    for w in decoded_sents
                ]
                ref = [
                    w.replace("<", "&lt;").replace(">", "&gt;").strip()
                    for w in reference_sents
                ]
                hyp = " ".join(hyp)
                ref = " ".join(ref)
            ref = ref.strip()
            hyp = hyp.strip()
            if detokenize_after:
                hyp = detokenize(hyp)
                ref = detokenize(ref)
            if isinstance(ref, str) and len(ref) <= 1:
                ref = "some content"
                print("Empty ref")
            if isinstance(hyp, str) and len(hyp) <= 1:
                hyp = "some content"
                print("Empty hyp")

            refs.append(ref)
            hyps.append(hyp)
            if len(hyps) % report_every == 0:
                calc_metrics(refs, hyps, metric)
            if max_count and len(hyps) >= max_count:
                break
        # The inner break only exits the per-example loop; stop the batch loop too.
        if max_count and len(hyps) >= max_count:
            break
    calc_metrics(refs, hyps, metric)
Example #22
def datasets_from_params(params: Params) -> Dict[str, Iterable[Instance]]:
    """
    Load all the datasets specified by the config.
    """
    dataset_reader = DatasetReader.from_params(params.pop('dataset_reader'))
    validation_dataset_reader_params = params.pop("validation_dataset_reader",
                                                  None)

    validation_and_test_dataset_reader: DatasetReader = dataset_reader
    if validation_dataset_reader_params is not None:
        logger.info(
            "Using a separate dataset reader to load validation and test data."
        )
        validation_and_test_dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)

    # train_data_path = params.pop('train_data_path')
    # logger.info("Reading training data from %s", train_data_path)
    # train_data = dataset_reader.read(train_data_path)

    # datasets: Dict[str, Iterable[Instance]] = {"train": train_data}

    datasets: Dict[str, Iterable[Instance]] = {}

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = validation_and_test_dataset_reader.read(
            validation_data_path)
        datasets["validation"] = validation_data

    test_data_path = validation_data_path.replace(
        'development', 'test') if validation_data_path else None
    if test_data_path is not None:
        logger.info("Reading test data from %s", test_data_path)
        test_data = validation_and_test_dataset_reader.read(test_data_path)
        datasets["test"] = test_data

    #other_data_path = params.pop("other_data_path",None)
    #if other_data_path is not None:
    #    logger.info("Reading other data from %s", other_data_path)
    #    other_data_path = validation_and_test_dataset_reader.read(other_data_path)
    #    datasets["other"] = other_data_path

    return datasets
Example #23
    def test_sentence_markers(self, data_path: str, sentence_marker_params: Params) -> None:
        reader = DatasetReader.from_params(sentence_marker_params)
        instances = reader.read(data_path)
        # vocab = Vocabulary.from_instances(instances)
        for instance in instances:
            tokens = instance["tokens"]
            sentence_markers = instance["metadata"]["sentence_markers"]
            sentences = get_sentences_from_markers(tokens, sentence_markers)
            assert sum(len(x) for x in sentences) == len(tokens) == sentence_markers[-1]
Example #24
def target_to_lines(archive_file, input_file, output_file, lowercase=True):
    archive = load_archive(archive_file)
    reader = DatasetReader.from_params(archive.config.pop("dataset_reader"))
    with open(output_file, "w") as w:
        for t in reader.parse_set(input_file):
            target = t[1]
            target = target.strip()
            target = target.lower() if lowercase else target
            w.write(target.replace("\n", " ") + "\n")
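A sketch of invoking this exporter, assuming the archive's reader provides the parse_set method used above (paths are placeholders):

# Hypothetical call; writes one lowercased target per line to targets.txt.
target_to_lines(
    archive_file="model.tar.gz",
    input_file="data/test.jsonl",
    output_file="targets.txt",
)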
Example #25
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides, args.weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader', None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)

    metrics = evaluate(model, instances, iterator, args.cuda_device, args.batch_weight_key)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    output_file = args.output_file
    if output_file:
        with open(output_file, "w") as file:
            json.dump(metrics, file, indent=4)
    return metrics
Example #26
def preprocess(train_path, vocabulary_path, config_path):
    params = Params.from_file(config_path)

    reader_params = params.pop("reader", default=Params({}))
    reader = DatasetReader.from_params(reader_params)
    dataset = reader.read(train_path)

    vocabulary_params = params.pop("vocabulary", default=Params({}))
    vocabulary = Vocabulary.from_params(vocabulary_params, instances=dataset)
    vocabulary.save_to_files(vocabulary_path)
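A usage sketch, assuming the config file carries top-level "reader" and "vocabulary" sections as the pops above imply (paths are placeholders):

# Hypothetical paths; reads the training data and saves the built vocabulary.
preprocess(
    train_path="data/train.jsonl",
    vocabulary_path="model/vocabulary",
    config_path="model/config.json",
)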
Example #27
    def test_can_load_from_params(self) -> None:
        params = Params({
            "type": "feature_reader",
            "token_indexers": {
                "tokens": "single_id"
            }
        })
        reader = DatasetReader.from_params(params)
        assert isinstance(reader._token_indexers["tokens"],
                          SingleIdTokenIndexer)
Example #28
def evaluating(**params):
    param_is_exist(["model_file", "input_file", "include_package"], params)
    for package_name in params["include_package"]:
        import_submodules(package_name)
    cuda_device = params["cuda_device"] if "cuda_device" in params else -1
    overrides = params["overrides"] if "overrides" in params else ""
    weights_file = params["weights_file"] if "weights_file" in params else ""
    archive = load_archive(params["model_file"], cuda_device, overrides,
                           weights_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data

    # Try to use the validation dataset reader if there is one - otherwise fall back
    # to the default dataset_reader used for both training and validation.
    validation_dataset_reader_params = config.pop('validation_dataset_reader',
                                                  None)
    if validation_dataset_reader_params is not None:
        dataset_reader = DatasetReader.from_params(
            validation_dataset_reader_params)
    else:
        dataset_reader = DatasetReader.from_params(
            config.pop('dataset_reader'))
    evaluation_data_path = params["input_file"]
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    iterator_params = config.pop("validation_iterator", None)
    if iterator_params is None:
        iterator_params = config.pop("iterator")
    iterator = DataIterator.from_params(iterator_params)
    iterator.index_with(model.vocab)
    metrics = evaluate(model, instances, iterator, cuda_device)
    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics
Example #29
def preprocess(train_path, vocabulary_path, config_path):
    assert os.path.isfile(train_path), "Train dataset file does not exist"
    assert os.path.isfile(config_path), "Config file does not exist"

    params = Params.from_file(config_path)

    reader_params = params.pop("reader", default=Params({}))
    vocabulary_params = params.pop("vocabulary", default=Params({}))

    reader = DatasetReader.from_params(reader_params)
    dataset = reader.read(train_path)

    vocabulary = Vocabulary.from_params(vocabulary_params, instances=dataset)
    vocabulary.save_to_files(vocabulary_path)
Example #30
def create_nabert_drop_reader(config, data_split='dev', lazy=True):
    if data_split == 'dev':
        data_path = "../../data/drop_dataset/drop_dataset_dev.json"
    elif data_split == 'train':
        data_path = "../../data/drop_dataset/drop_dataset_train.json"
    else:
        raise ValueError(
            f'Unsupported datasplit {data_split}, please use "dev" or "train"')

    config_params = config['dataset_reader']

    data_reader = DatasetReader.from_params(config_params)
    data_reader.lazy = lazy
    instances = data_reader.read(data_path)
    return instances
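A sketch of calling this helper. The reader type below is a placeholder for whichever NABERT reader the real config registers:

from allennlp.common import Params

# Hypothetical config; only the 'dataset_reader' section is consulted.
config = Params({"dataset_reader": {"type": "nabert_drop"}})
dev_instances = create_nabert_drop_reader(config, data_split='dev', lazy=True)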
Example #31
    def _test_model(self, file_name):
        params = self.params[file_name].duplicate()
        reader_params = params.duplicate().pop("reader", default=Params({}))
        if reader_params["type"] == "cnn_dailymail":
            reader_params["cnn_tokenized_dir"] = TEST_STORIES_DIR
            dataset_file = TEST_URLS_FILE
        elif reader_params["type"] == "ria":
            dataset_file = RIA_EXAMPLE_FILE
        else:
            assert False

        reader = DatasetReader.from_params(reader_params)
        tokenizer = reader._tokenizer
        dataset = reader.read(dataset_file)
        vocabulary_params = params.pop("vocabulary", default=Params({}))
        vocabulary = Vocabulary.from_params(vocabulary_params,
                                            instances=dataset)

        model_params = params.pop("model")
        model = Model.from_params(model_params, vocab=vocabulary)
        print(model)
        print("Trainable params count: ",
              sum(p.numel() for p in model.parameters() if p.requires_grad))

        iterator = DataIterator.from_params(params.pop('iterator'))
        iterator.index_with(vocabulary)
        trainer = Trainer.from_params(model, None, iterator, dataset, None,
                                      params.pop('trainer'))
        trainer.train()

        model.eval()
        predictor = Seq2SeqPredictor(model, reader)
        for article, reference_sents in reader.parse_set(dataset_file):
            ref_words = [
                token.text for token in tokenizer.tokenize(reference_sents)
            ]
            decoded_words = predictor.predict(article)["predicted_tokens"]
            self.assertGreaterEqual(len(decoded_words), len(ref_words))
            unk_count = 0
            while DEFAULT_OOV_TOKEN in decoded_words:
                unk_index = decoded_words.index(DEFAULT_OOV_TOKEN)
                decoded_words.pop(unk_index)
                unk_count += 1
                if unk_index < len(ref_words):
                    ref_words.pop(unk_index)
            self.assertLess(unk_count, 5)
            self.assertListEqual(decoded_words[:len(ref_words)], ref_words)
Example #32
def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    # Disable some of the more verbose logging statements
    logging.getLogger('allennlp.common.params').disabled = True
    logging.getLogger('allennlp.nn.initializers').disabled = True
    logging.getLogger('allennlp.modules.token_embedders.embedding').setLevel(
        logging.INFO)

    # Load from archive
    torch.cuda.set_device(args.cuda_device)
    archive = load_archive(args.archive_file, args.cuda_device, args.overrides)
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.cuda(args.cuda_device)
    model.eval()

    # Load the evaluation data
    dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    #dataset_reader.for_training = False
    evaluation_data_path = args.evaluation_data_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    dataset = dataset_reader.read(evaluation_data_path)
    dataset.index_instances(model.vocab)

    iterator = DataIterator.from_params(config.pop("iterator"))

    metrics = evaluate(model, dataset, iterator, args.cuda_device)

    if args.print_predictions:
        directory = os.path.dirname(args.archive_file)
        predict_filename = args.print_predictions
        predict_file = open(predict_filename, 'w')
        gold_file = open(os.path.join(directory, 'gold.conll'), 'w')
        predictions = evaluate_predict(model, dataset, iterator,
                                       args.cuda_device, predict_file,
                                       gold_file)

    logger.info("Finished evaluating.")
    logger.info("Metrics:")
    for key, metric in metrics.items():
        logger.info("%s: %s", key, metric)

    return metrics