def test_registry_has_builtin_dataset_readers(self):
    assert DatasetReader.by_name(u'snli').__name__ == u'SnliReader'
    assert DatasetReader.by_name(u'sequence_tagging').__name__ == u'SequenceTaggingDatasetReader'
    assert DatasetReader.by_name(u'language_modeling').__name__ == u'LanguageModelingReader'
    assert DatasetReader.by_name(u'squad').__name__ == u'SquadReader'
def test_registry_has_builtin_dataset_readers(self):
    assert DatasetReader.by_name("snli").__name__ == "SnliReader"
    assert DatasetReader.by_name("sequence_tagging").__name__ == "SequenceTaggingDatasetReader"
    assert DatasetReader.by_name("language_modeling").__name__ == "LanguageModelingReader"
    assert DatasetReader.by_name("squad").__name__ == "SquadReader"
def test_registry_has_builtin_dataset_readers(self):
    assert DatasetReader.by_name('snli').__name__ == 'SnliReader'
    assert DatasetReader.by_name('sequence_tagging').__name__ == 'SequenceTaggingDatasetReader'
    assert DatasetReader.by_name('language_modeling').__name__ == 'LanguageModelingReader'
    assert DatasetReader.by_name('squad_sentence_selection').__name__ == 'SquadSentenceSelectionReader'
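# For context, a minimal sketch of how a reader ends up in the registry that
# by_name() consults; the reader name and class below are hypothetical.
from allennlp.data.dataset_readers.dataset_reader import DatasetReader

@DatasetReader.register("my_toy_reader")
class MyToyReader(DatasetReader):
    def _read(self, file_path):
        ...

# by_name() resolves the registered name back to the class itself.
assert DatasetReader.by_name("my_toy_reader") is MyToyReader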
def load_predictor(serialization_dir, device):
    # Load the model
    archive = load_archive(join(serialization_dir, 'model.tar.gz'))
    model = archive.model.eval()
    if device >= 0:
        model.to(device)

    # Load the dataset reader
    dataset_reader_params = archive.config.pop('dataset_reader')
    model_name = archive.config.pop('model')['type']

    # Turn off truncation of the inputs
    if model_name == 'gnli':
        pass
        # dataset_reader_params.params['max_premise_length'] = None
        # dataset_reader_params.params['max_hypothesis_length'] = None
    elif model_name == 'bertnli':
        dataset_reader_params.params['max_seq_length'] = None
    else:
        raise ValueError(f"unexpected model type: {model_name}")

    reader = DatasetReader.by_name(dataset_reader_params.pop('type')).from_params(dataset_reader_params)
    predictor = Predictor(model, reader)
    return predictor
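# A minimal usage sketch for load_predictor above; the serialization directory
# is hypothetical, and device=-1 keeps the model on CPU.
predictor = load_predictor('runs/bertnli', device=-1)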
def main(config: str, model_th: str, dataset: str, hypo_file: str, ref_file: str,
         batch_size: int, no_gpu: bool):
    logger = logging.getLogger(__name__)
    logger.info("Loading configuration parameters")
    params = Params.from_file(config)

    vocab_params = params.pop("vocabulary")
    vocab = Vocabulary.from_params(vocab_params)

    reader_params = params.pop("dataset_reader")
    reader_name = reader_params.pop("type")
    # reader_params["lazy"] = True  # make sure we do not load the entire dataset
    reader = DatasetReader.by_name(reader_name).from_params(reader_params)

    logger.info("Reading data from {}".format(dataset))
    data = reader.read(dataset)

    iterator = BasicIterator(batch_size=batch_size)
    iterator.index_with(vocab)
    batches = iterator._create_batches(data, shuffle=False)

    logger.info("Loading model")
    model_params = params.pop("model")
    model_name = model_params.pop("type")
    model = Model.by_name(model_name).from_params(model_params, vocab=vocab)
    if not no_gpu:
        model.cuda(0)

    with open(model_th, 'rb') as f:
        if no_gpu:
            state_dict = torch.load(f, map_location=torch.device('cpu'))
        else:
            state_dict = torch.load(f)
        model.load_state_dict(state_dict)

    predictor = Seq2SeqPredictor(model, reader)
    model.eval()

    with open(hypo_file, 'w') as hf, open(ref_file, 'w') as rf:
        logger.info("Generating predictions")
        for sample in tqdm(batches):
            s = list(sample)
            pred = predictor.predict_batch_instance(s)
            for inst, p in zip(s, pred):
                print(" ".join(p["predicted_tokens"][0]), file=hf)
                print(" ".join(t.text for t in inst["target_tokens"][1:-1]), file=rf)
def main(config: str, model_th: str, dataset: str, out_file):
    logger = logging.getLogger(__name__)
    logger.info("Loading model and data")
    params = Params.from_file(config)

    vocab_params = params.pop("vocabulary")
    vocab = Vocabulary.from_params(vocab_params)

    reader_params = params.pop("dataset_reader")
    reader_name = reader_params.pop("type")
    # reader_params["lazy"] = True  # make sure we do not load the entire dataset
    reader = DatasetReader.by_name(reader_name).from_params(reader_params)

    logger.info("Reading data from {}".format(dataset))
    data = reader.read(dataset)

    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)
    batches = iterator._create_batches(data, shuffle=False)

    logger.info("Loading model")
    model_params = params.pop("model")
    model_name = model_params.pop("type")
    model = Model.by_name(model_name).from_params(model_params, vocab=vocab)
    model.cuda(0)
    with open(model_th, 'rb') as f:
        model.load_state_dict(torch.load(f))

    predictor = Seq2SeqPredictor(model, reader)
    model.eval()

    flip_trg_lang = {"graph": "text", "text": "graph"}

    line_id = 0
    writer = csv.writer(out_file, delimiter="\t")
    logger.info("Generating predictions")
    for sample in tqdm(batches):
        s = list(sample)
        pred = predictor.predict_batch_instance(s)
        for inst, p in zip(s, pred):
            writer.writerow((
                line_id,
                " ".join(p["predicted_tokens"][0]),
                flip_trg_lang[inst["target_language"].metadata],
                " ".join(t.text for t in inst["source_tokens"][1:-1]),
            ))
            line_id += 1
def main(config: str, model_th: str, dataset: str, seed: int):
    logger = logging.getLogger(__name__)
    logger.info("Loading model and data")
    params = Params.from_file(config)

    vocab_params = params.pop("vocabulary")
    vocab = Vocabulary.from_params(vocab_params)

    reader_params = params.pop("dataset_reader")
    reader_name = reader_params.pop("type")
    reader_params["lazy"] = True  # make sure we do not load the entire dataset
    reader = DatasetReader.by_name(reader_name).from_params(reader_params)
    data = reader.read(dataset)

    iterator = BasicIterator(batch_size=10)
    iterator.index_with(vocab)
    batches = iterator._create_batches(data, shuffle=False)

    model_params = params.pop("model")
    model_name = model_params.pop("type")
    model = Model.by_name(model_name).from_params(model_params, vocab=vocab)
    # model.cuda(cuda_device)
    with open(model_th, 'rb') as f:
        model.load_state_dict(torch.load(f))

    predictor = Seq2SeqPredictor(model, reader)
    model.eval()

    logger.info("Generating predictions")
    random.seed(seed)
    samples = []
    for b in batches:
        samples.append(b)
        if random.random() > 0.6:
            break
    sample = list(random.choice(samples))
    pred = predictor.predict_batch_instance(sample)
    for inst, p in zip(sample, pred):
        print()
        print("SOURCE:", " ".join([t.text for t in inst["source_tokens"]]))
        print("GOLD:", " ".join([t.text for t in inst["target_tokens"]]))
        print("GEN:", p["predicted_tokens"])
def test_implicit_include_package(self):
    # Create a new package in a temporary dir
    packagedir = self.TEST_DIR / "testpackage"
    packagedir.mkdir()
    (packagedir / "__init__.py").touch()

    # And add that directory to the path
    with push_python_path(self.TEST_DIR):
        # Write out a duplicate dataset reader there, but registered under a different name.
        reader = DatasetReader.by_name("text_classification_json")
        with open(inspect.getabsfile(reader)) as f:
            code = f.read().replace(
                """@DatasetReader.register("text_classification_json")""",
                """@DatasetReader.register("text_classification_json-fake")""",
            )
        with open(os.path.join(packagedir, "reader.py"), "w") as f:
            f.write(code)

        # Fails to import by registered name
        with pytest.raises(ConfigurationError) as exc:
            DatasetReader.by_name("text_classification_json-fake")
        assert "is not a registered name" in str(exc.value)

        # Fails to import with wrong module name
        with pytest.raises(ConfigurationError) as exc:
            DatasetReader.by_name(
                "testpackage.text_classification_json.TextClassificationJsonReader"
            )
        assert "unable to import module" in str(exc.value)

        # Fails to import with wrong class name (note the `as exc`, which the
        # flattened original dropped, so the assert checked a stale exception)
        with pytest.raises(ConfigurationError) as exc:
            DatasetReader.by_name("testpackage.reader.FakeReader")
        assert "unable to find class" in str(exc.value)

        # Imports successfully with right fully qualified name
        duplicate_reader = DatasetReader.by_name("testpackage.reader.TextClassificationJsonReader")
        assert duplicate_reader.__name__ == "TextClassificationJsonReader"
def from_params(cls, params: Params) -> 'MultiCorpusReader':
    token_indexers_params = params.pop('token_indexers', {})
    token_indexers = TokenIndexer.dict_from_params(token_indexers_params)
    corpus_langmap = params.pop('corpus_langmap', None)
    logger.info('corpus langmap %s', corpus_langmap)
    shuffle_corpus = params.pop('shuffle_corpus', True)

    corpus_readers_params: Dict = params.pop('corpus_readers', {})
    corpus_readers = defaultdict()
    # Use a distinct loop variable: the original reused `params`, shadowing the
    # outer Params object that is popped from and asserted empty below.
    for name, reader_params in corpus_readers_params.items():
        reader_params['token_indexers'] = token_indexers_params
        choice = reader_params.pop_choice('type', DatasetReader.list_available())
        corpus_readers[name] = DatasetReader.by_name(choice).from_params(reader_params)
        # corpus_readers[name] = DatasetReader.from_params(**reader_params)

    lazy = params.pop('lazy', True)
    params.assert_empty(cls.__name__)
    return MultiCorpusReader(token_indexers=token_indexers,
                             corpus_readers=corpus_readers,
                             corpus_langmap=corpus_langmap,
                             shuffle_corpus=shuffle_corpus,
                             lazy=lazy)
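# A hedged sketch of the Params layout this from_params() consumes; the corpus
# name, reader type, and langmap values are hypothetical.
from allennlp.common import Params

params = Params({
    "token_indexers": {"tokens": {"type": "single_id"}},
    "corpus_langmap": {"news": "en"},
    "shuffle_corpus": True,
    "corpus_readers": {
        "news": {"type": "sequence_tagging"},  # any registered reader type
    },
    "lazy": True,
})
reader = MultiCorpusReader.from_params(params)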
def test_registry_has_builtin_dataset_readers(self):
    assert DatasetReader.by_name('snli').__name__ == 'SnliReader'
    assert DatasetReader.by_name('sequence_tagging').__name__ == 'SequenceTaggingDatasetReader'
    assert DatasetReader.by_name('language_modeling').__name__ == 'LanguageModelingReader'
    assert DatasetReader.by_name('squad').__name__ == 'SquadReader'
# `name` is injected by pytest parametrization; the misspelled value below is
# illustrative, not from the original snippet.
@pytest.mark.parametrize("name", ["sequence_taging"])
def test_suggestions_when_name_not_found(name):
    with pytest.raises(ConfigurationError) as exc:
        DatasetReader.by_name(name)
    assert "did you mean 'sequence_tagging'?" in str(exc.value)
from allennlp.data.dataset_readers.dataset_reader import DatasetReader

if __name__ == "__main__":
    import sys

    # argv[1]: registered reader name; argv[2]: number of examples to preview
    # (defaults to 5); argv[3]: optional token format passed through to preview().
    n = (len(sys.argv) >= 3) and sys.argv[2].strip()
    n = (n and n.isdigit() and int(n)) or 5
    fmt = len(sys.argv) >= 4 and sys.argv[3].strip()
    reader = DatasetReader.by_name(sys.argv[1].strip())(lazy=True)
    reader.preview(n, fmt)
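# Invocation sketch for the preview script above; the script name and argument
# values are illustrative:
#   python preview_reader.py sequence_tagging 10 str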
def debugReader(file_path: Path, main_logger: logging.Logger):
    reader_name = 'superglue_record'
    main_logger.info(f"Reading '{file_path}' with reader '{reader_name}'")
    reader: DatasetReader = DatasetReader.by_name(reader_name)()
    test = list(reader.read(file_path))
    print(f"{len(test)} examples read from {file_path}")
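# Hypothetical invocation of debugReader above; the data path is illustrative.
import logging
from pathlib import Path

logging.basicConfig(level=logging.INFO)
debugReader(Path("data/ReCoRD/val.jsonl"), logging.getLogger(__name__))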
# from jdnlp.dataset
def field_tokens(inst, field, fmt):
    tokens = vars(inst.fields[field])['tokens']
    if fmt == "str":
        return " ".join(str(t) for t in tokens)
    elif fmt:
        return [getattr(t, fmt) for t in tokens]
    return [show_token(t) for t in tokens]

from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.common.util import import_submodules

import_submodules('jdnlp')

reader = DatasetReader.by_name('convokit_reader')
train = reader('conversation_has_personal_attack', max_turns=3, forecast=False, use_cache=False, lazy=True)
# trainset = train.read('conversations-gone-awry-corpus')
df = train.preview('conversations-gone-awry-corpus_test', n=None)
df.head()