Example 1
 def test_registry_has_builtin_dataset_readers(self):
     assert DatasetReader.by_name(u'snli').__name__ == u'SnliReader'
     assert DatasetReader.by_name(
         u'sequence_tagging').__name__ == u'SequenceTaggingDatasetReader'
     assert DatasetReader.by_name(
         u'language_modeling').__name__ == u'LanguageModelingReader'
     assert DatasetReader.by_name(u'squad').__name__ == u'SquadReader'
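For orientation: DatasetReader.by_name looks up a reader class in AllenNLP's registry and returns the class itself, not an instance, which is why the tests above compare its __name__. A minimal sketch of the lookup-then-instantiate pattern, assuming the reader's constructor defaults are sufficient:

reader_cls = DatasetReader.by_name('snli')  # the SnliReader class, not an instance
reader = reader_cls()                       # instantiate it like any other class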
Example 2
 def test_registry_has_builtin_dataset_readers(self):
     assert DatasetReader.by_name("snli").__name__ == "SnliReader"
     assert DatasetReader.by_name(
         "sequence_tagging").__name__ == "SequenceTaggingDatasetReader"
     assert DatasetReader.by_name(
         "language_modeling").__name__ == "LanguageModelingReader"
     assert DatasetReader.by_name("squad").__name__ == "SquadReader"
Example 3
 def test_registry_has_builtin_dataset_readers(self):
     assert DatasetReader.by_name('snli').__name__ == 'SnliReader'
     assert DatasetReader.by_name(
         'sequence_tagging').__name__ == 'SequenceTaggingDatasetReader'
     assert DatasetReader.by_name(
         'language_modeling').__name__ == 'LanguageModelingReader'
     assert DatasetReader.by_name(
         'squad_sentence_selection'
     ).__name__ == 'SquadSentenceSelectionReader'
Example 4
def load_predictor(serialization_dir, device):
	## Load the model
	archive = load_archive(join(serialization_dir, 'model.tar.gz'))
	model = archive.model.eval()
	if device >= 0:
		model.to(device)

	## Load the dataset reader
	dataset_reader_params = archive.config.pop('dataset_reader')
	model_name = archive.config.pop('model')['type']

	# Turn off truncation of the inputs
	if model_name == 'gnli':
		pass
		# dataset_reader_params.params['max_premise_length'] = None
		# dataset_reader_params.params['max_hypothesis_length'] = None
	elif model_name == 'bertnli':
		dataset_reader_params.params['max_seq_length'] = None
	else:
		raise ValueError(f"Unrecognized model type: {model_name}")

	reader = DatasetReader.by_name(dataset_reader_params.pop('type')).from_params(dataset_reader_params)

	predictor = Predictor(model, reader)
	return predictor
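A minimal usage sketch for load_predictor above; the serialization directory is a placeholder, and a negative device index keeps the model on the CPU:

predictor = load_predictor('experiments/gnli_run', device=-1)  # hypothetical path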
Example 5
def main(config: str, model_th: str, dataset: str, hypo_file: str, ref_file: str,
         batch_size: int, no_gpu: bool):
    logger = logging.getLogger(__name__)

    logger.info("Loading configuration parameters")
    params = Params.from_file(config)

    vocab_params = params.pop("vocabulary")
    vocab = Vocabulary.from_params(vocab_params)

    reader_params = params.pop("dataset_reader")
    reader_name = reader_params.pop("type")
    # reader_params["lazy"] = True  # make sure we do not load the entire dataset

    reader = DatasetReader.by_name(reader_name).from_params(reader_params)

    logger.info("Reading data from {}".format(dataset))
    data = reader.read(dataset)

    iterator = BasicIterator(batch_size=batch_size)
    iterator.index_with(vocab)
    batches = iterator._create_batches(data, shuffle=False)

    logger.info("Loading model")
    model_params = params.pop("model")
    model_name = model_params.pop("type")
    model = Model.by_name(model_name).from_params(model_params, vocab=vocab)
    if not no_gpu:
        model.cuda(0)

    with open(model_th, 'rb') as f:
        if no_gpu:
            state_dict = torch.load(f, map_location=torch.device('cpu'))
        else:
            state_dict = torch.load(f)

    model.load_state_dict(state_dict)

    predictor = Seq2SeqPredictor(model, reader)
    model.eval()

    with open(hypo_file, 'w') as hf, open(ref_file, 'w') as rf:
        logger.info("Generating predictions")
        for sample in tqdm(batches):
            s = list(sample)
            pred = predictor.predict_batch_instance(s)

            for inst, p in zip(s, pred):
                print(
                    " ".join(p["predicted_tokens"][0]),
                    file=hf
                )
                print(
                    " ".join(t.text for t in inst["target_tokens"][1:-1]),
                    file=rf
                )
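The construction idiom used here and in the next examples (pop the 'type' key, look up the registered class with by_name, then build it from the remaining parameters via from_params) can be sketched in isolation; the config fragment below is invented for illustration:

reader_params = Params({"type": "sequence_tagging", "lazy": True})  # invented fragment
reader_name = reader_params.pop("type")
reader = DatasetReader.by_name(reader_name).from_params(reader_params)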
Example 6
def main(config: str, model_th: str, dataset: str, out_file):
    logger = logging.getLogger(__name__)

    logger.info("Loading model and data")
    params = Params.from_file(config)

    vocab_params = params.pop("vocabulary")
    vocab = Vocabulary.from_params(vocab_params)

    reader_params = params.pop("dataset_reader")
    reader_name = reader_params.pop("type")
    # reader_params["lazy"] = True  # make sure we do not load the entire dataset

    reader = DatasetReader.by_name(reader_name).from_params(reader_params)

    logger.info("Reading data from {}".format(dataset))
    data = reader.read(dataset)

    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)
    batches = iterator._create_batches(data, shuffle=False)

    logger.info("Loading model")
    model_params = params.pop("model")
    model_name = model_params.pop("type")
    model = Model.by_name(model_name).from_params(model_params, vocab=vocab)
    model.cuda(0)

    with open(model_th, 'rb') as f:
        model.load_state_dict(torch.load(f))

    predictor = Seq2SeqPredictor(model, reader)
    model.eval()

    flip_trg_lang = {
        "graph": "text",
        "text": "graph"
    }

    line_id = 0
    writer = csv.writer(out_file, delimiter="\t")
    logger.info("Generating predictions")
    for sample in tqdm(batches):
        s = list(sample)
        pred = predictor.predict_batch_instance(s)

        for inst, p in zip(s, pred):
            writer.writerow((
                line_id,
                " ".join(p["predicted_tokens"][0]),
                flip_trg_lang[inst["target_language"].metadata],
                " ".join((t.text for t in inst["source_tokens"][1:-1]))
            ))
            line_id += 1
Example 7
def main(config: str, model_th: str, dataset: str, seed: int):
    logger = logging.getLogger(__name__)

    logger.info("Loading model and data")
    params = Params.from_file(config)

    vocab_params = params.pop("vocabulary")
    vocab = Vocabulary.from_params(vocab_params)

    reader_params = params.pop("dataset_reader")
    reader_name = reader_params.pop("type")
    reader_params["lazy"] = True  # make sure we do not load the entire dataset
    reader = DatasetReader.by_name(reader_name).from_params(reader_params)

    data = reader.read(dataset)

    iterator = BasicIterator(batch_size=10)
    iterator.index_with(vocab)

    batches = iterator._create_batches(data, shuffle=False)

    model_params = params.pop("model")
    model_name = model_params.pop("type")
    model = Model.by_name(model_name).from_params(model_params, vocab=vocab)
    # model.cuda(cuda_device)

    with open(model_th, 'rb') as f:
        model.load_state_dict(torch.load(f))

    predictor = Seq2SeqPredictor(model, reader)
    model.eval()

    logger.info("Generating predictions")

    random.seed(seed)
    samples = []
    for b in batches:
        samples.append(b)
        if random.random() > 0.6:
            break

    sample = list(random.choice(samples))
    pred = predictor.predict_batch_instance(sample)

    for inst, p in zip(sample, pred):
        print()
        print("SOURCE:", " ".join([t.text for t in inst["source_tokens"]]))
        print("GOLD:", " ".join([t.text for t in inst["target_tokens"]]))
        print("GEN:", p["predicted_tokens"])
Example 8
    def test_implicit_include_package(self):
        # Create a new package in a temporary dir
        packagedir = self.TEST_DIR / "testpackage"
        packagedir.mkdir()
        (packagedir / "__init__.py").touch()

        # And add that directory to the path
        with push_python_path(self.TEST_DIR):
            # Write out a duplicate dataset reader there, but registered under a different name.
            reader = DatasetReader.by_name("text_classification_json")

            with open(inspect.getabsfile(reader)) as f:
                code = f.read().replace(
                    """@DatasetReader.register("text_classification_json")""",
                    """@DatasetReader.register("text_classification_json-fake")""",
                )

            with open(os.path.join(packagedir, "reader.py"), "w") as f:
                f.write(code)

            # Fails to import by registered name
            with pytest.raises(ConfigurationError) as exc:
                DatasetReader.by_name("text_classification_json-fake")
            assert "is not a registered name" in str(exc.value)

            # Fails to import with wrong module name
            with pytest.raises(ConfigurationError) as exc:
                DatasetReader.by_name(
                    "testpackage.text_classification_json.TextClassificationJsonReader"
                )
            assert "unable to import module" in str(exc.value)

            # Fails to import with wrong class name
            with pytest.raises(ConfigurationError) as exc:
                DatasetReader.by_name("testpackage.reader.FakeReader")
            assert "unable to find class" in str(exc.value)

            # Imports successfully with right fully qualified name
            duplicate_reader = DatasetReader.by_name(
                "testpackage.reader.TextClassificationJsonReader"
            )
            assert duplicate_reader.__name__ == "TextClassificationJsonReader"
Example 9
 def from_params(cls, params: Params) -> 'MultiCorpusReader':
     token_indexers_params = params.pop('token_indexers', {})
     token_indexers = TokenIndexer.dict_from_params(token_indexers_params)
     corpus_langmap = params.pop('corpus_langmap', None)
     logger.info('corpus langmap %s', corpus_langmap)
     shuffle_corpus = params.pop('shuffle_corpus', True)
     corpus_readers_params: Dict = params.pop('corpus_readers', {})
     corpus_readers = defaultdict()
     # Use a distinct loop variable so the outer `params` object is not shadowed
     for name, reader_params in corpus_readers_params.items():
         reader_params['token_indexers'] = token_indexers_params
         choice = reader_params.pop_choice('type', DatasetReader.list_available())
         corpus_readers[name] = DatasetReader.by_name(choice).from_params(
             reader_params)
         # corpus_readers[name] = DatasetReader.from_params(**reader_params)
     lazy = params.pop('lazy', True)
     params.assert_empty(cls.__name__)
     return MultiCorpusReader(token_indexers=token_indexers,
                              corpus_readers=corpus_readers,
                              corpus_langmap=corpus_langmap,
                              shuffle_corpus=shuffle_corpus,
                              lazy=lazy)
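A hedged sketch of the kind of configuration the from_params method above consumes; MultiCorpusReader is the custom class this method belongs to, the corpus names and reader types are invented, and token indexers are left to their defaults:

params = Params({
    'corpus_readers': {
        'news': {'type': 'sequence_tagging'},
        'web': {'type': 'sequence_tagging'},
    },
    'shuffle_corpus': True,
    'lazy': True,
})
reader = MultiCorpusReader.from_params(params)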
Example 10
 def test_registry_has_builtin_dataset_readers(self):
     assert DatasetReader.by_name('snli').__name__ == 'SnliReader'
     assert DatasetReader.by_name('sequence_tagging').__name__ == 'SequenceTaggingDatasetReader'
     assert DatasetReader.by_name('language_modeling').__name__ == 'LanguageModelingReader'
     assert DatasetReader.by_name('squad').__name__ == 'SquadReader'
Example 11
def test_suggestions_when_name_not_found(name):
    with pytest.raises(ConfigurationError) as exc:
        DatasetReader.by_name(name)
        assert "did you mean 'sequence_tagging'?" in str(exc.value)
Example 12
from allennlp.data.dataset_readers.dataset_reader import DatasetReader

if __name__ == "__main__":
    import sys

    # Optional second CLI argument: how many instances to preview (defaults to 5)
    n = (len(sys.argv) >= 3) and sys.argv[2].strip()
    n = (n and n.isdigit() and int(n)) or 5

    # Optional third CLI argument: token format passed through to preview()
    fmt = len(sys.argv) >= 4 and sys.argv[3].strip()

    # Look up the registered reader class by name and instantiate it lazily
    reader = DatasetReader.by_name(sys.argv[1].strip())(lazy=True)
    reader.preview(n, fmt)
Example 13
def debugReader(file_path: Path, main_logger: logging.Logger):
    reader_name = 'superglue_record'
    main_logger.info(f"Reading '{file_path}' with reader '{reader_name}'")
    reader: DatasetReader = DatasetReader.by_name(reader_name)()
    test = list(reader.read(file_path))
    print(f"{len(test)} examples read from {file_path}")
Example 14
# from jdnlp.dataset


def field_tokens(inst, field, fmt):
    tokens = vars(inst.fields[field])['tokens']
    if fmt == "str":
        return " ".join(str(t) for t in tokens)
    elif fmt:
        return [getattr(t, fmt) for t in tokens]

    return [show_token(t) for t in tokens]


from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.common.util import import_submodules

import_submodules('jdnlp')

reader = DatasetReader.by_name('convokit_reader')
train = reader('conversation_has_personal_attack',
               max_turns=3,
               forecast=False,
               use_cache=False,
               lazy=True)
# trainset = train.read('conversations-gone-awry-corpus')
df = train.preview('conversations-gone-awry-corpus_test', n=None)
df.head()