def test_batch_prediction(self):
    """Batch-predict two sentences with the biaffine parser fixture.

    Each output must carry one predicted head and one string dependency
    label per word.
    """
    batch = [
        {"sentence": "What kind of test succeeded on its first attempt?"},
        {"sentence": "What kind of test succeeded on its first attempt at batch processing?"},
    ]

    model_path = (
        self.FIXTURES_ROOT / "biaffine_dependency_parser" / "serialization" / "model.tar.gz"
    )
    parser = Predictor.from_archive(load_archive(model_path), "biaffine-dependency-parser")

    outputs = parser.predict_batch_json(batch)
    assert len(outputs) == 2

    for output in outputs:
        num_words = len(output.get("words"))

        heads = output.get("predicted_heads")
        assert len(heads) == num_words

        labels = output.get("predicted_dependencies")
        assert len(labels) == num_words
        assert isinstance(labels, list)
        assert all(isinstance(label, str) for label in labels)
def test_sentence(self, sentence):
    """Run the 'event2mind_predictor' predictor on ``sentence`` and print the result."""
    # Restore the pre-trained model from its archive.
    model_archive = load_archive('model.tar.gz')

    # Build the predictor on top of the archive and run it on the sentence.
    event2mind = Predictor.from_archive(model_archive, 'event2mind_predictor')
    print(event2mind.predict(sentence))
def __init__(
    self,
    target_namespace: str,
    span_predictor_model,
    source_tokenizer: Tokenizer = None,
    target_tokenizer: Tokenizer = None,
    source_token_indexers: Dict[str, TokenIndexer] = None,
    lazy: bool = False,
    add_rule=True,
    embed_span=True,
    add_question=True,
    add_followup_ques=True,
    train_using_gold=True,
) -> None:
    """Configure tokenizers and indexers, then load the span predictor.

    ``span_predictor_model`` is handed to ``load_archive``; the archive's
    ``dataset_reader`` config is used to build ``self.dataset_reader`` and
    the archive itself backs ``self.span_predictor``.

    Raises ``ConfigurationError`` when the source indexers do not contain a
    ``SingleIdTokenIndexer`` under the key ``"tokens"``.
    """
    super().__init__(lazy)
    self._target_namespace = target_namespace

    # Source side defaults to a word tokenizer; the target side reuses the
    # source tokenizer unless one is supplied explicitly.
    if not source_tokenizer:
        source_tokenizer = WordTokenizer()
    self._source_tokenizer = source_tokenizer
    self._target_tokenizer = target_tokenizer or source_tokenizer

    if not source_token_indexers:
        source_token_indexers = {"tokens": SingleIdTokenIndexer()}
    self._source_token_indexers = source_token_indexers

    # Feature switches, stored verbatim for use elsewhere in the reader.
    self.add_rule = add_rule
    self.embed_span = embed_span
    self.add_question = add_question
    self.add_followup_ques = add_followup_ques
    self.train_using_gold = train_using_gold

    has_single_id_tokens = "tokens" in self._source_token_indexers and isinstance(
        self._source_token_indexers["tokens"], SingleIdTokenIndexer
    )
    if not has_single_id_tokens:
        raise ConfigurationError(
            "CopyNetDatasetReader expects 'source_token_indexers' to contain "
            "a 'single_id' token indexer called 'tokens'."
        )

    self._target_token_indexers: Dict[str, TokenIndexer] = {
        "tokens": SingleIdTokenIndexer(namespace=self._target_namespace)
    }

    # The span predictor ships with its own dataset reader configuration.
    span_archive = load_archive(span_predictor_model)
    reader_params = span_archive.config.duplicate()["dataset_reader"]
    self.dataset_reader = DatasetReader.from_params(reader_params)
    self.span_predictor = Predictor.from_archive(span_archive, 'sharc_predictor')
def test_batch_prediction(self):
    """Batch-predict two sentences with the constituency parser fixture.

    For a sentence of n tokens the parser is expected to score n(n+1)/2
    spans (21 for 6 tokens, 36 for 8 tokens), each with a class
    distribution that sums to one.
    """
    batch = [
        {"sentence": "What a great test sentence."},
        {"sentence": "Here's another good, interesting one."},
    ]
    model_path = (
        FIXTURES_ROOT / "syntax" / "constituency_parser" / "serialization" / "model.tar.gz"
    )
    parser = Predictor.from_archive(load_archive(model_path), "constituency-parser")
    outputs = parser.predict_batch_json(batch)

    # (expected span count, expected tokens), in the same order as ``batch``.
    expectations = [
        (21, ["What", "a", "great", "test", "sentence", "."]),
        (36, ["Here", "'s", "another", "good", ",", "interesting", "one", "."]),
    ]
    for index, (span_count, tokens) in enumerate(expectations):
        output = outputs[index]
        assert len(output["spans"]) == span_count
        assert len(output["class_probabilities"]) == span_count
        assert output["tokens"] == tokens
        assert isinstance(output["trees"], str)
        for distribution in output["class_probabilities"]:
            self.assertAlmostEqual(sum(distribution), 1.0, places=4)
def test_uses_named_inputs(self):
    """
    Checks the structure of an open-information-extraction prediction:
    the tokenized words, and one string tag per word for every verb.
    """
    payload = {
        "sentence": "Angela Merkel met and spoke to her EU counterparts during the climate summit."
    }

    model_path = self.FIXTURES_ROOT / 'srl' / 'serialization' / 'model.tar.gz'
    extractor = Predictor.from_archive(load_archive(model_path), 'open-information-extraction')
    output = extractor.predict_json(payload)

    tokens = output.get("words")
    assert tokens == [
        "Angela", "Merkel", "met", "and", "spoke", "to", "her", "EU",
        "counterparts", "during", "the", "climate", "summit", ".",
    ]
    token_count = len(tokens)

    extractions = output.get("verbs")
    assert extractions is not None
    assert isinstance(extractions, list)

    for extraction in extractions:
        tag_sequence = extraction.get("tags")
        assert tag_sequence is not None
        assert isinstance(tag_sequence, list)
        assert all(isinstance(tag, str) for tag in tag_sequence)
        assert len(tag_sequence) == token_count
def __init__(self, model_path: str, predictor_type: str, cuda_device: int = -1):
    """Load the archive at ``model_path`` and wrap it in a predictor.

    ``predictor_type`` is both stored on the instance and passed to
    ``Predictor.from_archive``; ``cuda_device`` is forwarded to
    ``load_archive``.
    """
    self.predictor_type = predictor_type
    archive = load_archive(model_path, cuda_device=cuda_device)
    self.predictor = Predictor.from_archive(archive, predictor_type)
def test_model_internals(self):
    """Capture module outputs during one prediction and inspect the last entry.

    Also verifies that no forward hooks remain on the model once the
    capturing context manager has exited.
    """
    model_path = FIXTURES_ROOT / "bidaf" / "serialization" / "model.tar.gz"
    predictor = Predictor.from_archive(load_archive(model_path), "reading_comprehension")

    example = {
        "question": "What kind of test succeeded on its first attempt?",
        "passage": "One time I was writing a unit test, and it succeeded on the first attempt.",
    }

    # Everything the model computes inside this block is recorded in ``captured``.
    with predictor.capture_model_internals() as captured:
        predictor.predict_json(example)

    assert captured is not None
    assert len(captured) == 24

    last_layer = captured[23]
    print(last_layer)
    assert "Linear(in_features=50, out_features=1, bias=True)" in last_layer["name"]

    first_output = last_layer["output"][0]
    assert len(first_output) == 17
    assert all(len(row) == 1 for row in first_output)

    # Leaving the context manager must have removed every forward hook.
    for submodule in predictor._model.modules():
        assert not submodule._forward_hooks
def test_uses_named_inputs(self):
    """Check the span and probability fields of a reading-comprehension prediction."""
    example = {
        "question": "What kind of test succeeded on its first attempt?",
        "passage": "One time I was writing a unit test, and it succeeded on the first attempt.",
    }

    model_path = FIXTURES_ROOT / "bidaf" / "serialization" / "model.tar.gz"
    predictor = Predictor.from_archive(load_archive(model_path), "reading-comprehension")
    output = predictor.predict_json(example)

    # The best span is an ordered [start, end] pair of ints.
    span = output.get("best_span")
    assert span is not None
    assert isinstance(span, list)
    assert len(span) == 2
    assert all(isinstance(bound, int) for bound in span)
    assert span[0] <= span[1]

    span_text = output.get("best_span_str")
    assert isinstance(span_text, str)
    assert span_text != ""

    # Start and end distributions are lists of floats that sum to one.
    for key in ("span_start_probs", "span_end_probs"):
        distribution = output.get(key)
        assert distribution is not None
        assert all(isinstance(p, float) for p in distribution)
        assert sum(distribution) == approx(1.0)
def test_with_token_characters_indexer(self):
    """Run Hotflip on a classifier that also has a character-level indexer."""
    model_path = self.FIXTURES_ROOT / "basic_classifier" / "serialization" / "model.tar.gz"
    predictor = Predictor.from_archive(load_archive(model_path))

    # Add a "chars" indexer to the reader and a matching empty embedder to the model.
    reader_indexers = predictor._dataset_reader._token_indexers
    reader_indexers["chars"] = TokenCharactersIndexer(min_padding_length=1)
    model_embedders = predictor._model._text_field_embedder._token_embedders
    model_embedders["chars"] = EmptyEmbedder()

    example = {"sentence": "I always write unit tests for my code."}

    attacker = Hotflip(predictor)
    attacker.initialize()
    attack = attacker.attack_from_json(example, "tokens", "grad_input_1")

    assert attack is not None
    for expected_key in ("final", "original", "outputs"):
        assert expected_key in attack
    # Hotflip replaces words without removing any, so the lengths must match.
    assert len(attack["final"][0]) == len(attack["original"])

    # Regression check for a bug that arose with a change in the pytorch API: make sure we
    # can handle re-encoding a vocab item that was not saved in the fake embedding matrix
    # (see the Hotflip docstring for more info).
    small_vocab_attacker = Hotflip(predictor, max_tokens=50)
    small_vocab_attacker.initialize()
    small_vocab_attacker._first_order_taylor(
        grad=torch.rand((10,)).numpy(),
        token_idx=torch.tensor(60),
        sign=1,
    )
def test_batch_prediction(self):
    """Batch-predict two sentences with the constituency parser fixture.

    For a sentence of n tokens the parser is expected to score n(n+1)/2
    spans (21 for 6 tokens, 36 for 8 tokens), each with a class
    distribution that sums to one.
    """
    inputs = [
        {u"sentence": u"What a great test sentence."},
        {u"sentence": u"Here's another good, interesting one."},
    ]

    archive = load_archive(
        self.FIXTURES_ROOT / u'constituency_parser' / u'serialization' / u'model.tar.gz'
    )
    predictor = Predictor.from_archive(archive, u'constituency-parser')
    results = predictor.predict_batch_json(inputs)

    result = results[0]
    assert len(result[u"spans"]) == 21  # number of possible substrings of the sentence.
    assert len(result[u"class_probabilities"]) == 21
    assert result[u"tokens"] == [u"What", u"a", u"great", u"test", u"sentence", u"."]
    # The Python 2 builtin ``unicode`` does not exist on Python 3; text is ``str``.
    assert isinstance(result[u"trees"], str)
    for class_distribution in result[u"class_probabilities"]:
        self.assertAlmostEqual(sum(class_distribution), 1.0, places=4)

    result = results[1]
    assert len(result[u"spans"]) == 36  # number of possible substrings of the sentence.
    assert len(result[u"class_probabilities"]) == 36
    assert result[u"tokens"] == [
        u"Here", u"'s", u"another", u"good", u",", u"interesting", u"one", u".",
    ]
    assert isinstance(result[u"trees"], str)
    for class_distribution in result[u"class_probabilities"]:
        self.assertAlmostEqual(sum(class_distribution), 1.0, places=4)
def test_predictions_to_labeled_instances(self):
    """Check that labeled instances mark exactly the model's top spans."""
    payload = {
        "document": "This is a single string document about a test. Sometimes it "
                    "contains coreferent parts."
    }

    model_path = self.FIXTURES_ROOT / 'coref' / 'serialization' / 'model.tar.gz'
    predictor = Predictor.from_archive(load_archive(model_path), 'coreference-resolution')

    instance = predictor._json_to_instance(payload)
    outputs = predictor._model.forward_on_instance(instance)
    labeled_instances = predictor.predictions_to_labeled_instances(instance, outputs)
    assert labeled_instances is not None

    for labeled in labeled_instances:
        assert 'span_labels' in labeled
        # This fixture is expected to produce 60 span labels.
        assert len(labeled['span_labels']) == 60

        expected_spans = {tuple(span) for span in outputs['top_spans']}
        # Collect the top spans whose label in this instance is truthy.
        flagged_spans = {
            tuple(span)
            for position, span in enumerate(outputs['top_spans'])
            if labeled['span_labels'][position]
        }
        assert expected_spans == flagged_spans
def test_uses_named_inputs(self):
    """Check the tokenized document and cluster structure of a coref prediction.

    Every mention is a pair of integer token indices that must fall inside
    the tokenized document.
    """
    inputs = {
        "document": "This is a single string document about a test. Sometimes it "
                    "contains coreferent parts."
    }

    archive = load_archive(self.FIXTURES_ROOT / 'coref' / 'serialization' / 'model.tar.gz')
    predictor = Predictor.from_archive(archive, 'coreference-resolution')
    result = predictor.predict_json(inputs)

    document = result["document"]
    assert document == [
        'This', 'is', 'a', 'single', 'string', 'document', 'about', 'a', 'test', '.',
        'Sometimes', 'it', 'contains', 'coreferent', 'parts', '.',
    ]
    num_tokens = len(document)

    clusters = result["clusters"]
    assert isinstance(clusters, list)
    for cluster in clusters:
        assert isinstance(cluster, list)
        for mention in cluster:
            # Spans should be integer indices.
            assert isinstance(mention[0], int)
            assert isinstance(mention[1], int)
            # Spans should be inside document. Indices are 0-based, so the
            # valid range is [0, len(document)): index 0 is legal and
            # len(document) is already out of bounds.
            assert 0 <= mention[0] < num_tokens
            assert 0 <= mention[1] < num_tokens
def test_uses_named_inputs(self):
    """Check SRL predictions for both JSON input and pre-tokenized input."""
    model_path = FIXTURES_ROOT / "syntax" / "srl" / "serialization" / "model.tar.gz"
    labeler = Predictor.from_archive(load_archive(model_path), "semantic-role-labeling")

    # Raw-sentence entry point.
    from_json = labeler.predict_json(
        {"sentence": "The squirrel wrote a unit test to make sure its nuts worked as designed."}
    )
    self.assert_predict_result(from_json)

    # Pre-tokenized entry point, using the same sentence.
    tokens = [
        "The", "squirrel", "wrote", "a", "unit", "test", "to", "make",
        "sure", "its", "nuts", "worked", "as", "designed", ".",
    ]
    from_tokens = labeler.predict_tokenized(tokens)
    self.assert_predict_result(from_tokens)
def get_predictor(args):
    """Load ``args.archive_file`` and return ``(predictor, model_type)``.

    Only archives whose configured model type is ``'srl'`` or ``'coref'``
    are accepted; any other type raises ``Exception``.
    """
    archive = load_archive(
        args.archive_file,
        weights_file=None,
        cuda_device=args.cuda_device,
        overrides="",
    )
    model_type = archive.config.get("model").get("type")

    if model_type == 'srl':
        predictor_name = 'semantic-role-labeling'
    elif model_type == 'coref':
        predictor_name = 'coreference-resolution'
    else:
        raise Exception('the given model must be srl or coref.')

    return Predictor.from_archive(archive, predictor_name), model_type
def my_sample_fever():
    """Build the FEVER web application around a pre-trained predictor.

    Reads the JSON config named by the ``CONFIG_PATH`` environment variable
    (default ``configs/predict_docker.json``), sets up the document
    database, the retrieval method and the predictor, and returns the
    result of ``fever_web_api`` wired to a batch prediction function.
    """
    logger = logging.getLogger()
    dictConfig({
        'version': 1,
        'formatters': {
            'default': {
                'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
            }
        },
        'handlers': {
            'wsgi': {
                'class': 'logging.StreamHandler',
                'stream': 'ext://sys.stderr',
                'formatter': 'default'
            }
        },
        'root': {
            'level': 'INFO',
            'handlers': ['wsgi']
        },
        # NOTE(review): a top-level 'allennlp' key is not part of the dictConfig
        # schema (per-logger settings belong under 'loggers'), so this entry
        # appears to have no effect. Left unchanged to avoid altering log output.
        'allennlp': {
            'level': 'INFO',
            'handlers': ['wsgi']
        },
    })

    logger.info("My sample FEVER application")

    # Close the config file deterministically instead of leaking the handle.
    config_path = os.getenv("CONFIG_PATH", "configs/predict_docker.json")
    with open(config_path) as config_file:
        config = json.load(config_file)

    # Create document retrieval model
    logger.info("Load FEVER Document database from {0}".format(config["database"]))
    db = FEVERDocumentDatabase(config["database"])

    logger.info("Load DrQA Document retrieval index from {0}".format(config['index']))
    retrieval_method = RetrievalMethod.by_name("top_docs")(
        db, config["index"], config["n_docs"], config["n_sents"]
    )

    # Load the pre-trained predictor and model from the .tar.gz in the config file.
    # Override the database location for our model as this now comes from a read-only volume.
    # The override is serialized with json.dumps so that quotes or backslashes in the
    # path are escaped rather than corrupting the JSON.
    logger.info("Load Model from {0}".format(config['model']))
    overrides = json.dumps({"dataset_reader": {"database": config["database"]}})
    archive = load_archive(
        config["model"],
        cuda_device=config["cuda_device"],
        overrides=overrides,
    )
    predictor = Predictor.from_archive(archive, predictor_name="fever")

    # The prediction function that is passed to the web server for FEVER2.0
    def baseline_predict(instances):
        return [
            predict_single(predictor, retrieval_method, instance)
            for instance in instances
        ]

    return fever_web_api(baseline_predict)
def test_batch_prediction(self):
    """Batch-predict two sentences with the constituency parser fixture.

    A sentence of n tokens is expected to yield n(n+1)/2 scored spans
    (21 for 6 tokens, 36 for 8 tokens), each with probabilities summing to one.
    """
    sentences = [
        {"sentence": "What a great test sentence."},
        {"sentence": "Here's another good, interesting one."},
    ]
    model_path = self.FIXTURES_ROOT / 'constituency_parser' / 'serialization' / 'model.tar.gz'
    parser = Predictor.from_archive(load_archive(model_path), 'constituency-parser')
    predictions = parser.predict_batch_json(sentences)

    # (expected span count, expected tokens), in the same order as ``sentences``.
    expected = [
        (21, ["What", "a", "great", "test", "sentence", "."]),
        (36, ["Here", "'s", "another", "good", ",", "interesting", "one", "."]),
    ]
    for position, (num_spans, tokens) in enumerate(expected):
        prediction = predictions[position]
        assert len(prediction["spans"]) == num_spans
        assert len(prediction["class_probabilities"]) == num_spans
        assert prediction["tokens"] == tokens
        assert isinstance(prediction["trees"], str)
        for probabilities in prediction["class_probabilities"]:
            self.assertAlmostEqual(sum(probabilities), 1.0, places=4)
def test_uses_named_inputs(self):
    """Check the span and probability fields of a machine-comprehension prediction."""
    example = {
        "question": "What kind of test succeeded on its first attempt?",
        "passage": "One time I was writing a unit test, and it succeeded on the first attempt."
    }

    model_path = self.FIXTURES_ROOT / 'bidaf' / 'serialization' / 'model.tar.gz'
    predictor = Predictor.from_archive(load_archive(model_path), 'machine-comprehension')
    output = predictor.predict_json(example)

    # The best span is an ordered [start, end] pair of ints.
    span = output.get("best_span")
    assert span is not None
    assert isinstance(span, list)
    assert len(span) == 2
    assert all(isinstance(bound, int) for bound in span)
    assert span[0] <= span[1]

    span_text = output.get("best_span_str")
    assert isinstance(span_text, str)
    assert span_text != ""

    # Start and end distributions are lists of floats that sum to one.
    for key in ("span_start_probs", "span_end_probs"):
        distribution = output.get(key)
        assert distribution is not None
        assert all(isinstance(p, float) for p in distribution)
        assert sum(distribution) == approx(1.0)
def test_uses_named_inputs(self):
    """Check words, detected verbs and per-verb tags of an SRL prediction."""
    payload = {
        "sentence": "The squirrel wrote a unit test to make sure its nuts worked as designed."
    }

    model_path = self.FIXTURES_ROOT / 'srl' / 'serialization' / 'model.tar.gz'
    labeler = Predictor.from_archive(load_archive(model_path), 'semantic-role-labeling')
    output = labeler.predict_json(payload)

    tokens = output.get("words")
    assert tokens == [
        "The", "squirrel", "wrote", "a", "unit", "test", "to", "make",
        "sure", "its", "nuts", "worked", "as", "designed", ".",
    ]
    token_count = len(tokens)

    frames = output.get("verbs")
    assert frames is not None
    assert isinstance(frames, list)

    # Each of these verbs must appear in at least one frame.
    for expected_verb in ("wrote", "make", "worked"):
        assert any(frame["verb"] == expected_verb for frame in frames)

    for frame in frames:
        tag_sequence = frame.get("tags")
        assert tag_sequence is not None
        assert isinstance(tag_sequence, list)
        assert all(isinstance(tag, str) for tag in tag_sequence)
        assert len(tag_sequence) == token_count
def test_uses_named_inputs(self):
    """
    Verifies the output format of the open-information-extraction predictor:
    the expected word list, and for each verb a list of string tags with
    one tag per word.
    """
    request = {
        "sentence": "Angela Merkel met and spoke to her EU counterparts during the climate summit."
    }

    archive_path = self.FIXTURES_ROOT / 'srl' / 'serialization' / 'model.tar.gz'
    open_ie = Predictor.from_archive(load_archive(archive_path), 'open-information-extraction')
    prediction = open_ie.predict_json(request)

    sentence_words = prediction.get("words")
    assert sentence_words == [
        "Angela", "Merkel", "met", "and", "spoke", "to", "her", "EU",
        "counterparts", "during", "the", "climate", "summit", ".",
    ]
    expected_length = len(sentence_words)

    verb_entries = prediction.get("verbs")
    assert verb_entries is not None
    assert isinstance(verb_entries, list)

    for entry in verb_entries:
        entry_tags = entry.get("tags")
        assert entry_tags is not None
        assert isinstance(entry_tags, list)
        assert all(isinstance(tag, str) for tag in entry_tags)
        assert len(entry_tags) == expected_length
def test_uses_named_inputs(self):
    """Check that entailment probabilities and logits are mutually consistent."""
    pair = {
        "premise": "I always write unit tests for my code.",
        "hypothesis": "One time I didn't write any unit tests for my code."
    }

    model_path = self.FIXTURES_ROOT / 'decomposable_attention' / 'serialization' / 'model.tar.gz'
    predictor = Predictor.from_archive(load_archive(model_path), 'textual-entailment')
    output = predictor.predict_json(pair)

    # Three non-negative float probabilities that sum to one.
    probabilities = output.get("label_probs")
    assert probabilities is not None
    assert isinstance(probabilities, list)
    assert len(probabilities) == 3
    assert all(isinstance(p, float) for p in probabilities)
    assert all(p >= 0 for p in probabilities)
    assert sum(probabilities) == approx(1.0)

    # Three float logits whose softmax reproduces the probabilities.
    logits = output.get("label_logits")
    assert logits is not None
    assert isinstance(logits, list)
    assert len(logits) == 3
    assert all(isinstance(logit, float) for logit in logits)

    exponentials = [math.exp(logit) for logit in logits]
    normalizer = sum(exponentials)
    for exponential, probability in zip(exponentials, probabilities):
        assert exponential / normalizer == approx(probability)
def test_batch_prediction(self):
    """Batch-predict two sentences with the biaffine dependency parser fixture.

    Each result must contain as many predicted heads and string
    dependency labels as it has words.
    """
    requests = [
        {"sentence": "What kind of test succeeded on its first attempt?"},
        {"sentence": "What kind of test succeeded on its first attempt at batch processing?"},
    ]

    archive_path = (
        self.FIXTURES_ROOT / 'biaffine_dependency_parser' / 'serialization' / 'model.tar.gz'
    )
    dependency_parser = Predictor.from_archive(
        load_archive(archive_path), 'biaffine-dependency-parser'
    )

    predictions = dependency_parser.predict_batch_json(requests)
    assert len(predictions) == 2

    for prediction in predictions:
        length = len(prediction.get("words"))

        head_indices = prediction.get("predicted_heads")
        assert len(head_indices) == length

        dependency_labels = prediction.get("predicted_dependencies")
        assert len(dependency_labels) == length
        assert isinstance(dependency_labels, list)
        assert all(isinstance(label, str) for label in dependency_labels)
def test_name(self, name):
    """Run the 'name-predictor' predictor on ``name`` and print the result."""
    # Restore the pre-trained model from its archive.
    model_archive = load_archive('./pre_trained/model.tar.gz')

    # Build the predictor on top of the archive and run it on the name.
    name_predictor = Predictor.from_archive(model_archive, 'name-predictor')
    print(name_predictor.predict(name))
def test_uses_named_inputs(self):
    """Check the label probabilities and logits of a textual-entailment prediction."""
    request = {
        "premise": "I always write unit tests for my code.",
        "hypothesis": "One time I didn't write any unit tests for my code."
    }

    archive_path = self.FIXTURES_ROOT / 'decomposable_attention' / 'serialization' / 'model.tar.gz'
    entailment = Predictor.from_archive(load_archive(archive_path), 'textual-entailment')
    prediction = entailment.predict_json(request)

    # Label probs should be 3 non-negative floats that sum to one.
    probs = prediction.get("label_probs")
    assert probs is not None
    assert isinstance(probs, list)
    assert len(probs) == 3
    assert all(isinstance(value, float) for value in probs)
    assert all(value >= 0 for value in probs)
    assert sum(probs) == approx(1.0)

    # Logits should be 3 floats that softmax to the label probs.
    raw_scores = prediction.get("label_logits")
    assert raw_scores is not None
    assert isinstance(raw_scores, list)
    assert len(raw_scores) == 3
    assert all(isinstance(value, float) for value in raw_scores)

    exp_scores = [math.exp(value) for value in raw_scores]
    total = sum(exp_scores)
    for exp_score, prob in zip(exp_scores, probs):
        assert exp_score / total == approx(prob)
def load_claim_extraction_model(model_path: str = MODEL_PATH,
                                weight_path: str = WEIGHT_PATH):
    """
    Build the claim-extraction model from an AllenNLP archive and a separate weight file.

    The underlying model comes from the 'discourse_crf_predictor' predictor; its CRF and
    label projection layers are replaced with two-class versions before the weights are loaded.
    see: http://github.com/titipata/detecting-scientific-claim

    :param model_path: location of the model archive (passed to ``load_archive``)
    :param weight_path: location of the model weights (resolved through ``cached_path``)
    :return: the model with the weights from ``weight_path`` loaded
    """
    discourse_predictor = Predictor.from_archive(load_archive(model_path), 'discourse_crf_predictor')

    # NOTE(alpha_darklord): the CRF model is rebuilt the same way allennlp creates it,
    # for reference go to: http://github.com/titipata/detecting-scientific-claim
    model = discourse_predictor._model

    # Freeze the parameters loaded from the archive so they are not trained.
    for weight in model.parameters():
        weight.requires_grad = False

    embedding_dim = 300
    num_classes = 2
    constraints = None
    include_start_end_transitions = False

    model.crf = ConditionalRandomField(
        num_classes,
        constraints,
        include_start_end_transitions=include_start_end_transitions,
    )
    model.label_projection_layer = TimeDistributed(Linear(2 * embedding_dim, num_classes))

    state_dict = torch.load(cached_path(weight_path), map_location='cpu')
    model.load_state_dict(state_dict)
    return model
def allennlp(
    path_to_senteval: str,
    path_to_allennlp_archive: str,
    output_filepath: str = None,
    weights_file: str = None,
    cuda_device: int = -1,
    output_dict_field: str = "embeddings",
    predictor_name: str = None,
    include_package: List[str] = None,
    prototyping_config: bool = False,
    verbose: bool = False,
) -> None:
    """Evaluates a trained AllenNLP model against the SentEval benchmark."""
    # Imported lazily, inside the function, as in the original implementation.
    from allennlp.models.archival import load_archive
    from allennlp.predictors import Predictor

    def prepare(params, samples):
        # SentEval requires a prepare callback; there is nothing to set up here.
        return

    @torch.no_grad()
    def batcher(params, batch):
        cleaned = _cleanup_batch(batch)
        # Join tokens back into text so the dataset reader's own tokenizer is applied.
        payloads = [{"text": " ".join(tokens)} for tokens in cleaned]
        predictions = params.predictor.predict_batch_json(payloads)
        # AllenNLP models return a dictionary; pull the embeddings out by the given key.
        return np.vstack([prediction[output_dict_field] for prediction in predictions])

    # Import any custom dataset readers and models the AllenNLP archive may rely on.
    # See: https://tinyurl.com/whkmoqh
    for package_name in include_package or []:
        common_util.import_module_and_submodules(package_name)

    # Load the archived model and wrap it in a predictor.
    model_archive = load_archive(
        path_to_allennlp_archive,
        cuda_device=cuda_device,
        weights_file=weights_file,
        overrides="{'trainer.use_amp': true}",
    )
    predictor = Predictor.from_archive(model_archive, predictor_name)
    typer.secho(
        f'{SUCCESS} Model from AllenNLP archive "{path_to_allennlp_archive}" loaded successfully.',
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Build the SentEval params and attach the predictor so ``batcher`` can reach it.
    senteval_params = _setup_senteval(path_to_senteval, prototyping_config, verbose)
    senteval_params["predictor"] = predictor
    _run_senteval(senteval_params, path_to_senteval, batcher, prepare, output_filepath)
def __init__(self, final_candidates_filename, filter_by_corpus, output_df, encoding="utf-8"):
    """Store the configuration and load the coreference-resolution predictor.

    The model archive is loaded from a fixed URL via ``load_archive``.
    """
    self.final_candidates_filename = final_candidates_filename
    self.encoding = encoding
    self.output_df = output_df
    self.filter_by_corpus = filter_by_corpus

    coref_archive = load_archive(
        'https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz',
        weights_file=None,
        overrides="",
    )
    self.predictor = Predictor.from_archive(coref_archive, 'coreference-resolution')
def __init__(self, data_path, output_name, encoding="utf-8"):
    """Store the paths and encoding, and load the coreference-resolution predictor.

    The model archive is loaded from a fixed URL via ``load_archive``.
    """
    self.data_path = data_path
    self.output_name = output_name
    self.encoding = encoding

    coref_archive = load_archive(
        'https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz',
        weights_file=None,
        overrides="",
    )
    self.predictor = Predictor.from_archive(coref_archive, 'coreference-resolution')
def _get_predictor(args: argparse.Namespace) -> Predictor:
    """Build a predictor from parsed command-line arguments.

    Reads ``archive_path``, ``weights_file``, ``cuda_device``, ``overrides``
    and ``predictor`` from ``args``. ``check_for_gpu`` is called with the
    requested device before anything is loaded.
    """
    device = args.cuda_device
    check_for_gpu(device)

    model_archive = load_archive(
        args.archive_path,
        weights_file=args.weights_file,
        cuda_device=device,
        overrides=args.overrides,
    )
    return Predictor.from_archive(model_archive, args.predictor)
def test_sentence(self, sentence):
    """Run the 'sentence_classifier_predictor' predictor on ``sentence`` and print the result."""
    # Restore the pre-trained model from its archive.
    model_archive = load_archive('./pre-trained/bert.tar.gz')

    # Build the predictor on top of the archive and run it on the sentence.
    classifier = Predictor.from_archive(model_archive, 'sentence_classifier_predictor')
    print(classifier.predict(sentence))
def get_predictor(args):
    """Load ``args.archive_file`` on ``args.cuda_device`` and return a 'base_predictor' predictor."""
    print(f"Loading Model from {args.archive_file}")
    model_archive = load_archive(args.archive_file, cuda_device=args.cuda_device)
    predictor = Predictor.from_archive(model_archive, predictor_name="base_predictor")
    return predictor
def test_sentence2instance(self):
    """Run the 'sentence-segment' predictor on a sample sentence and print the result."""
    model_archive = load_archive('tests/fixture/model.tar.gz')
    segmenter = Predictor.from_archive(model_archive, 'sentence-segment')

    payload = {"sentence": "我是大哥大"}
    print(segmenter.predict_json(payload))
def setUp(self):
    """Load the BiDAF fixture and keep its predictor on ``self.bidaf_predictor``."""
    super().setUp()

    # Import the models package before the archive is loaded.
    importlib.import_module("allennlp_rc.models")

    bidaf_archive = load_archive("allennlp_server/tests/fixtures/bidaf/model.tar.gz")
    predictor_class_path = "allennlp_rc.predictors.ReadingComprehensionPredictor"
    self.bidaf_predictor = Predictor.from_archive(bidaf_archive, predictor_class_path)
def load_predictor(model_dir: str,
                   predictor_name: str,
                   cuda_device: int = -1,
                   archive_filename: str = "model.tar.gz",
                   weights_file: Optional[str] = None) -> Predictor:
    """Load a predictor from an archive stored inside ``model_dir``.

    :param model_dir: directory that contains the model archive
    :param predictor_name: name passed to ``Predictor.from_archive``
    :param cuda_device: device id forwarded to ``load_archive``
    :param archive_filename: file name of the archive inside ``model_dir``
    :param weights_file: optional weights file forwarded to ``load_archive``
    :return: the predictor built from the loaded archive
    """
    archive_path = join(model_dir, archive_filename)
    # Pass options by keyword: the third positional parameter of
    # ``load_archive`` is ``overrides``, so a positional ``weights_file``
    # would be interpreted as configuration overrides.
    archive = load_archive(archive_path, cuda_device=cuda_device, weights_file=weights_file)
    return Predictor.from_archive(archive, predictor_name)
def test_batch_prediction(self):
    """The same sentence submitted twice in one batch must give equal predictions."""
    model_path = self.FIXTURES_ROOT / "srl" / "serialization" / "model.tar.gz"
    labeler = Predictor.from_archive(load_archive(model_path), "semantic-role-labeling")

    payload = {
        "sentence": "The squirrel wrote a unit test to make sure its nuts worked as designed."
    }
    outputs = labeler.predict_batch_json([payload, payload])
    assert outputs[0] == outputs[1]
def test_batch_prediction(self):
    """Predicting a duplicated sentence as a batch must yield two equal results."""
    request = {
        "sentence": "The squirrel wrote a unit test to make sure its nuts worked as designed."
    }

    archive_path = self.FIXTURES_ROOT / 'srl' / 'serialization' / 'model.tar.gz'
    srl = Predictor.from_archive(load_archive(archive_path), 'semantic-role-labeling')

    first, second = srl.predict_batch_json([request, request])[:2]
    assert first == second
def main(args):
    """Parse ``args``, load the requested predictor and serve the demo app.

    ``--archive-path``, ``--predictor`` and at least one ``--field-name``
    are required. The server listens on all interfaces at ``--port``
    (default 8000) and blocks in ``serve_forever``.
    """
    parser = argparse.ArgumentParser(description='Serve up a simple model')
    parser.add_argument(
        '--archive-path', type=str, required=True, help='path to trained archive file'
    )
    parser.add_argument('--predictor', type=str, required=True, help='name of predictor')
    parser.add_argument('--static-dir', type=str, help='serve index.html from this directory')
    parser.add_argument(
        '--title', type=str, help='change the default page title', default="AllenNLP Demo"
    )
    parser.add_argument(
        '--field-name',
        type=str,
        required=True,
        action='append',
        help='field names to include in the demo',
    )
    parser.add_argument('--port', type=int, default=8000, help='port to serve the demo on')
    parser.add_argument(
        '--include-package',
        type=str,
        action='append',
        default=[],
        help='additional packages to include',
    )
    options = parser.parse_args(args)

    # Import the requested packages before the archive is loaded.
    for package_name in options.include_package:
        import_submodules(package_name)

    predictor = Predictor.from_archive(load_archive(options.archive_path), options.predictor)

    app = make_app(
        predictor=predictor,
        field_names=options.field_name,
        static_dir=options.static_dir,
        title=options.title,
    )
    CORS(app)

    server = WSGIServer(('0.0.0.0', options.port), app)
    print(f"Model loaded, serving demo on port {options.port}")
    server.serve_forever()
def test_coref_resolved(self):
    """``coref_resolved`` takes a document string and must return a string."""
    model_path = FIXTURES_ROOT / "coref" / "serialization" / "model.tar.gz"
    resolver = Predictor.from_archive(load_archive(model_path), "coreference-resolution")

    resolved = resolver.coref_resolved("This is a test sentence.")
    assert isinstance(resolved, str)
def test_prediction_with_no_verbs(self):
    """
    A sentence for which no verbs are found must yield the words and an empty verb list.
    """
    model_path = self.FIXTURES_ROOT / 'srl' / 'serialization' / 'model.tar.gz'
    extractor = Predictor.from_archive(load_archive(model_path), 'open-information-extraction')

    output = extractor.predict_json({"sentence": "Blah no verb sentence."})

    expected = {'words': ['Blah', 'no', 'verb', 'sentence', '.'], 'verbs': []}
    assert output == expected
def test_predictor_with_direct_parser(self):
    """Check the NLVR direct-parser prediction for ``self.inputs``."""
    serialization_dir = (
        self.FIXTURES_ROOT / 'semantic_parsing' / 'nlvr_direct_semantic_parser' / 'serialization'
    )
    model_archive = load_archive(os.path.join(serialization_dir, 'model.tar.gz'))
    nlvr_parser = Predictor.from_archive(model_archive, 'nlvr-parser')

    output = nlvr_parser.predict_json(self.inputs)
    for expected_key in ('logical_form', 'denotations'):
        assert expected_key in output

    # output['denotations'] is a list corresponding to k-best logical forms, where k is 1
    # by default; each entry holds one denotation per world, and the input has two worlds.
    first_denotations = output['denotations'][0]
    assert len(first_denotations) == 2
def test_atis_parser_batch_predicted_sql_present(self):
    """A one-item ATIS batch must produce a ``predicted_sql_query`` entry."""
    model_path = (
        self.FIXTURES_ROOT / 'semantic_parsing' / 'atis' / 'serialization' / 'model.tar.gz'
    )
    atis_parser = Predictor.from_archive(load_archive(model_path), 'atis-parser')

    batch = [{"utterance": "show me flights to seattle"}]
    outputs = atis_parser.predict_batch_json(batch)

    sql_query = outputs[0].get("predicted_sql_query")
    assert sql_query is not None
def test_copynet_predictions(self):
    """CopyNet must return one string-token prediction per beam, without the end token."""
    model_path = (
        self.FIXTURES_ROOT / 'encoder_decoder' / 'copynet_seq2seq' / 'serialization' / 'model.tar.gz'
    )
    predictor = Predictor.from_archive(load_archive(model_path), 'seq2seq')

    copynet = predictor._model
    end_symbol = copynet.vocab.get_token_from_index(
        copynet._end_index, copynet._target_namespace
    )

    output = predictor.predict("these tokens should be copied over : hello world")

    # One prediction, and one token sequence, per beam.
    assert len(output["predictions"]) == copynet._beam_search.beam_size
    assert len(output["predicted_tokens"]) == copynet._beam_search.beam_size

    for token_sequence in output["predicted_tokens"]:
        assert all(isinstance(token, str) for token in token_sequence)
        assert end_symbol not in token_sequence
def test_answer_present_with_batch_predict(self):
    """A one-item WikiTables batch must produce an ``answer`` entry."""
    model_path = (
        self.FIXTURES_ROOT / 'semantic_parsing' / 'wikitables' / 'serialization' / 'model.tar.gz'
    )
    table_parser = Predictor.from_archive(load_archive(model_path), 'wikitables-parser')

    batch = [{
        "question": "Who is 18 years old?",
        "table": "Name\tAge\nShallan\t16\nKaladin\t18"
    }]
    outputs = table_parser.predict_batch_json(batch)

    assert outputs[0].get("answer") is not None
def test_prediction_with_no_verbs(self):
    """Verb-less sentences must yield an empty verb list, alone and inside a batch."""
    model_path = self.FIXTURES_ROOT / 'srl' / 'serialization' / 'model.tar.gz'
    labeler = Predictor.from_archive(load_archive(model_path), 'semantic-role-labeling')

    without_verb = {"sentence": "Blah no verb sentence."}
    with_verb = {"sentence": "This sentence has a verb."}

    expected_without_verb = {'words': ['Blah', 'no', 'verb', 'sentence', '.'], 'verbs': []}
    expected_with_verb = {
        'words': ['This', 'sentence', 'has', 'a', 'verb', '.'],
        'verbs': [{
            'verb': 'has',
            'description': 'This sentence has a verb .',
            'tags': ['O', 'O', 'O', 'O', 'O', 'O'],
        }],
    }

    # Single-instance prediction.
    assert labeler.predict_json(without_verb) == expected_without_verb

    # Mixed batch: one sentence without a verb and one with.
    batch_outputs = labeler.predict_batch_json([without_verb, with_verb])
    assert batch_outputs[0] == expected_without_verb
    assert batch_outputs[1] == expected_with_verb
def test_uses_named_inputs_with_simple_seq2seq(self):
    """Predict from a ``source`` JSON field and check ``predicted_tokens`` is a list of str."""
    model_path = (self.FIXTURES_ROOT / 'encoder_decoder' / 'simple_seq2seq' /
                  'serialization' / 'model.tar.gz')
    predictor = Predictor.from_archive(load_archive(model_path), 'seq2seq')

    payload = {"source": "What kind of test succeeded on its first attempt?"}
    output = predictor.predict_json(payload)

    tokens = output.get("predicted_tokens")
    assert tokens is not None
    assert isinstance(tokens, list)
    assert all(isinstance(token, str) for token in tokens)
def test_uses_named_inputs(self):
    """Run the coreference predictor on raw text and on pre-tokenized words.

    Both outputs are validated with ``self.assert_predict_result``.
    """
    text = ("This is a single string document about a test. Sometimes it "
            "contains coreferent parts.")
    model_path = self.FIXTURES_ROOT / 'coref' / 'serialization' / 'model.tar.gz'
    predictor = Predictor.from_archive(load_archive(model_path), 'coreference-resolution')

    # JSON input with the document as a single string.
    self.assert_predict_result(predictor.predict_json({"document": text}))

    # The same document supplied as a list of tokens.
    tokens = ['This', 'is', 'a', 'single', 'string', 'document', 'about', 'a', 'test', '.',
              'Sometimes', 'it', 'contains', 'coreferent', 'parts', '.']
    self.assert_predict_result(predictor.predict_tokenized(tokens))
def test_uses_named_inputs(self):
    """Run the SRL predictor on a raw sentence and on pre-tokenized words.

    Both outputs are validated with ``self.assert_predict_result``.
    """
    model_path = self.FIXTURES_ROOT / 'srl' / 'serialization' / 'model.tar.gz'
    predictor = Predictor.from_archive(load_archive(model_path), 'semantic-role-labeling')

    # JSON input with the sentence as a single string.
    payload = {
            "sentence": "The squirrel wrote a unit test to make sure its nuts worked as designed."
    }
    self.assert_predict_result(predictor.predict_json(payload))

    # The same sentence supplied as a list of tokens.
    tokens = ["The", "squirrel", "wrote", "a", "unit", "test", "to", "make", "sure",
              "its", "nuts", "worked", "as", "designed", "."]
    self.assert_predict_result(predictor.predict_tokenized(tokens))
def test_uses_named_inputs(self):
    """Check the constituency parser's output for a six-token sentence.

    Verifies the number of spans and class distributions, the tokenization, that
    the tree is returned as a string, and that each class distribution sums to 1.
    """
    model_path = (self.FIXTURES_ROOT / 'constituency_parser' /
                  'serialization' / 'model.tar.gz')
    predictor = Predictor.from_archive(load_archive(model_path), 'constituency-parser')

    output = predictor.predict_json({"sentence": "What a great test sentence."})

    # Number of possible substrings (contiguous token spans) of the sentence.
    expected_span_count = 21
    assert len(output["spans"]) == expected_span_count
    assert len(output["class_probabilities"]) == expected_span_count
    assert output["tokens"] == ["What", "a", "great", "test", "sentence", "."]
    assert isinstance(output["trees"], str)

    for distribution in output["class_probabilities"]:
        self.assertAlmostEqual(sum(distribution), 1.0, places=4)
def test_predictor_uses_dataset_reader_to_determine_pos_set(self):
    """Check that the POS tag set follows the dataset reader's configuration.

    With the reader as loaded from the archive the instance carries one set of
    tags; after setting ``use_language_specific_pos`` on the reader, the same
    sentence must be tagged with the other, finer-grained set.
    """
    # pylint: disable=protected-access
    model_path = (self.FIXTURES_ROOT / 'biaffine_dependency_parser' /
                  'serialization' / 'model.tar.gz')
    predictor = Predictor.from_archive(load_archive(model_path), 'biaffine-dependency-parser')
    payload = {"sentence": "Dogs eat cats."}

    def pos_labels():
        # Build a fresh instance each time so the reader's current setting is used.
        return predictor._json_to_instance(payload).fields["pos_tags"].labels

    assert pos_labels() == ['NOUN', 'VERB', 'NOUN', 'PUNCT']

    predictor._dataset_reader.use_language_specific_pos = True
    assert pos_labels() == ['NNS', 'VBP', 'NNS', '.']
def test_atis_parser_uses_named_inputs(self):
    """Predict from an ``utterance`` JSON field and sanity-check any decoded actions.

    The checks only run when the model returns a non-empty action sequence.
    """
    model_path = (self.FIXTURES_ROOT / 'semantic_parsing' / 'atis' /
                  'serialization' / 'model.tar.gz')
    predictor = Predictor.from_archive(load_archive(model_path), 'atis-parser')

    output = predictor.predict_json({"utterance": "show me the flights to seattle"})
    actions = output.get("best_action_sequence")

    # An untrained model will likely get into a loop and not produce any finished
    # states. In that case it produces no valid SQL and therefore no actions, so
    # there is nothing to check: this basically just tests that the model runs.
    if not actions:
        return

    assert len(actions) > 1
    assert all(isinstance(action, str) for action in actions)
    assert output.get("predicted_sql_query") is not None
def test_uses_named_inputs(self):
    """Run the dialog QA predictor on one paragraph with three questions.

    Every entry of the returned ``best_span_str`` list must be a non-empty string.
    """
    questions = [
            {"followup": "y", "yesno": "x", "question": "When was the first one?",
             "answers": [{"answer_start": 0, "text": "One time"}], "id": "C_q#0"},
            {"followup": "n", "yesno": "x", "question": "What were you doing?",
             "answers": [{"answer_start": 15, "text": "writing a"}], "id": "C_q#1"},
            {"followup": "m", "yesno": "y", "question": "How often?",
             "answers": [{"answer_start": 4, "text": "time I"}], "id": "C_q#2"},
    ]
    # The passage is split with implicit string concatenation. A backslash followed
    # by a space inside the literal is an invalid escape sequence and would leave a
    # stray backslash in the passage text.
    context = ("One time I was writing a unit test, "
               "and it succeeded on the first attempt.")
    inputs = {"paragraphs": [{"qas": questions, "context": context}]}

    model_path = self.FIXTURES_ROOT / 'dialog_qa' / 'serialization' / 'model.tar.gz'
    predictor = Predictor.from_archive(load_archive(model_path), 'dialog_qa')

    result = predictor.predict_json(inputs)
    best_span_str_list = result.get("best_span_str")

    # Fail with a clear assertion rather than a TypeError when the key is missing.
    assert best_span_str_list is not None
    for best_span_str in best_span_str_list:
        assert isinstance(best_span_str, str)
        assert best_span_str != ""
def test_answer_present(self):
    """Check the QuaRel parser returns an ``answer_index`` with and without entity cues."""
    payload = {
            'question': 'Mike was snowboarding on the snow and hit a piece of ice. He went much faster on the ice because _____ is smoother. (A) snow (B) ice',  # pylint: disable=line-too-long
            # Supplying the world literals avoids the world tagger.
            'world_literals': {'world1': 'snow', 'world2': 'ice'},
            'qrspec': '[smoothness, +speed]',
            'entitycues': 'smoothness: smoother\nspeed:faster'
    }

    model_path = (self.FIXTURES_ROOT / 'semantic_parsing' / 'quarel' /
                  'serialization_parser_zeroshot' / 'model.tar.gz')
    predictor = Predictor.from_archive(load_archive(model_path), 'quarel-parser')

    # First pass uses the full input; second pass exercises the input modality
    # where entity cues are not given.
    for drop_entity_cues in (False, True):
        if drop_entity_cues:
            del payload['entitycues']
        output = predictor.predict_json(payload)
        assert output.get('answer_index') is not None
def test_uses_named_inputs(self):
    """Check the biaffine dependency parser's JSON output for one sentence.

    Verifies that heads and dependency labels align with the words, that the
    loss entries are present, and that the hierplane tree (minus its two
    styling entries) matches the expected structure exactly.
    """
    model_path = (self.FIXTURES_ROOT / 'biaffine_dependency_parser' /
                  'serialization' / 'model.tar.gz')
    predictor = Predictor.from_archive(load_archive(model_path), 'biaffine-dependency-parser')

    result = predictor.predict_json({"sentence": "Please could you parse this sentence?"})

    word_count = len(result.get("words"))
    assert len(result.get("predicted_heads")) == word_count

    dependencies = result.get("predicted_dependencies")
    assert len(dependencies) == word_count
    assert isinstance(dependencies, list)
    assert all(isinstance(label, str) for label in dependencies)

    for loss_key in ("loss", "arc_loss", "tag_loss"):
        assert result.get(loss_key) is not None

    # Drop the styling entries in place so only the tree structure is compared.
    hierplane_tree = result.get("hierplane_tree")
    hierplane_tree.pop("nodeTypeToStyle")
    hierplane_tree.pop("linkToPosition")

    # (word, POS attribute, span start, span end) for each child of the root.
    child_specs = [
            ('could', 'VERB', 7, 13),
            ('you', 'PRON', 13, 17),
            ('parse', 'VERB', 17, 23),
            ('this', 'DET', 23, 28),
            ('sentence', 'NOUN', 28, 37),
            ('?', 'PUNCT', 37, 39),
    ]
    expected_children = [
            {'word': word,
             'nodeType': 'nummod',
             'attributes': [pos],
             'link': 'nummod',
             'spans': [{'start': start, 'end': end}]}
            for word, pos, start, end in child_specs
    ]
    expected_tree = {
            'text': 'Please could you parse this sentence ?',
            'root': {
                    'word': 'Please',
                    'nodeType': 'det',
                    'attributes': ['INTJ'],
                    'link': 'det',
                    'spans': [{'start': 0, 'end': 7}],
                    'children': expected_children,
            },
    }
    assert result.get("hierplane_tree") == expected_tree
def main(args):
    """Parse command-line options, load a model archive, and serve a demo over HTTP.

    ``--archive-path``, ``--predictor`` and at least one ``--field-name`` are
    required; argument parsing exits if any of them is missing. Packages named
    with ``--include-package`` are imported before the archive is loaded. The
    call blocks in ``serve_forever`` once the server has started.

    Parameters
    ----------
    args :
        The list of command-line argument strings handed to ``parse_args``.
    """
    parser = argparse.ArgumentParser(description='Serve up a simple model')

    # Options are registered in this order, which is also the order of the help text.
    option_specs = [
            ('--archive-path', dict(type=str, required=True,
                                    help='path to trained archive file')),
            ('--predictor', dict(type=str, required=True, help='name of predictor')),
            ('--static-dir', dict(type=str, help='serve index.html from this directory')),
            ('--title', dict(type=str, help='change the default page title',
                             default="AllenNLP Demo")),
            ('--field-name', dict(type=str, required=True, action='append',
                                  help='field names to include in the demo')),
            ('--port', dict(type=int, default=8000, help='port to serve the demo on')),
            ('--include-package', dict(type=str, action='append', default=[],
                                       help='additional packages to include')),
    ]
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    parsed = parser.parse_args(args)

    # Load modules
    for package_name in parsed.include_package:
        import_submodules(package_name)

    predictor = Predictor.from_archive(load_archive(parsed.archive_path), parsed.predictor)

    app = make_app(predictor=predictor,
                   field_names=parsed.field_name,
                   static_dir=parsed.static_dir,
                   title=parsed.title)
    CORS(app)

    # The server listens on every network interface ('0.0.0.0').
    http_server = WSGIServer(('0.0.0.0', parsed.port), app)
    print(f"Model loaded, serving demo on port {parsed.port}")
    http_server.serve_forever()
def test_batch_prediction(self):
    """Batch-predict two entailment pairs and validate probabilities and logits.

    For each result, ``label_probs`` must be three non-negative floats summing to
    one, ``label_logits`` must be three floats, and the softmax of the logits must
    match the probabilities.
    """
    model_path = (self.FIXTURES_ROOT / 'decomposable_attention' /
                  'serialization' / 'model.tar.gz')
    predictor = Predictor.from_archive(load_archive(model_path), 'textual-entailment')

    batch_inputs = [
            {
                    "premise": "I always write unit tests for my code.",
                    "hypothesis": "One time I didn't write any unit tests for my code."
            },
            {
                    "premise": "I also write batched unit tests for throughput!",
                    "hypothesis": "Batch tests are slower."
            },
    ]
    results = predictor.predict_batch_json(batch_inputs)
    print(results)
    assert len(results) == 2

    for result in results:
        # Label probs should be 3 floats that sum to one.
        label_probs = result.get("label_probs")
        assert label_probs is not None
        assert isinstance(label_probs, list)
        assert len(label_probs) == 3
        assert all(isinstance(prob, float) for prob in label_probs)
        assert all(prob >= 0 for prob in label_probs)
        assert sum(label_probs) == approx(1.0)

        # Logits should be 3 floats that softmax to label_probs.
        label_logits = result.get("label_logits")
        assert label_logits is not None
        assert isinstance(label_logits, list)
        assert len(label_logits) == 3
        assert all(isinstance(logit, float) for logit in label_logits)

        exponentials = [math.exp(logit) for logit in label_logits]
        normalizer = sum(exponentials)
        for exponential, prob in zip(exponentials, label_probs):
            assert exponential / normalizer == approx(prob)
def test_uses_named_inputs(self):
    """Predict from a ``source`` JSON field with the event2mind predictor.

    For each of the three top-k output keys, every predicted sequence must be a
    list of strings.
    """
    model_path = self.FIXTURES_ROOT / 'event2mind' / 'serialization' / 'model.tar.gz'
    predictor = Predictor.from_archive(load_archive(model_path), 'event2mind')

    output = predictor.predict_json({"source": "personx gave persony a present"})

    output_keys = (
            'xintent_top_k_predicted_tokens',
            'xreact_top_k_predicted_tokens',
            'oreact_top_k_predicted_tokens',
    )
    for key in output_keys:
        for candidate in output.get(key):
            assert isinstance(candidate, list)
            assert all(isinstance(token, str) for token in candidate)
def test_batch_prediction(self):
    """Batch-predict two identical dialog QA inputs and check two results come back."""

    def make_input():
        # Return a freshly built input each call so the two batch elements share
        # no mutable state.
        questions = [
                {"followup": "y", "yesno": "x", "question": "When was the first one?",
                 "answers": [{"answer_start": 0, "text": "One time"}], "id": "C_q#0"},
                {"followup": "n", "yesno": "x", "question": "What were you doing?",
                 "answers": [{"answer_start": 15, "text": "writing a"}], "id": "C_q#1"},
                {"followup": "m", "yesno": "y", "question": "How often?",
                 "answers": [{"answer_start": 4, "text": "time I"}], "id": "C_q#2"},
        ]
        # The passage is split with implicit string concatenation. A backslash
        # followed by a space inside the literal is an invalid escape sequence and
        # would leave a stray backslash in the passage text.
        context = ("One time I was writing a unit test, "
                   "and it succeeded on the first attempt.")
        return {"paragraphs": [{"qas": questions, "context": context}]}

    inputs = [make_input(), make_input()]

    model_path = self.FIXTURES_ROOT / 'dialog_qa' / 'serialization' / 'model.tar.gz'
    predictor = Predictor.from_archive(load_archive(model_path), 'dialog_qa')

    results = predictor.predict_batch_json(inputs)
    assert len(results) == 2
def test_uses_named_inputs(self):
    """Predict from ``question``/``table`` JSON fields and sanity-check decoded actions.

    The checks only run when the model returns a non-empty action sequence.
    """
    model_path = (self.FIXTURES_ROOT / 'semantic_parsing' / 'wikitables' /
                  'serialization' / 'model.tar.gz')
    predictor = Predictor.from_archive(load_archive(model_path), 'wikitables-parser')

    payload = {
            "question": "names",
            "table": "name\tdate\nmatt\t2017\npradeep\t2018",
    }
    output = predictor.predict_json(payload)
    actions = output.get("best_action_sequence")

    # We don't currently disallow endless loops in the decoder, and an untrained
    # seq2seq model will easily get itself into one. An endless loop isn't a
    # finished logical form, so decoding returns no finished states and therefore
    # no actions. Sadly that means this is not a great test: it basically just
    # checks that the predictor runs.
    if not actions:
        return

    assert len(actions) > 1
    assert all(isinstance(action, str) for action in actions)
    assert output.get("logical_form") is not None
def test_uses_named_inputs(self):
    """Check the coreference predictor's tokenized document and cluster structure.

    The returned ``document`` must be the expected token list, and ``clusters``
    must be a list of lists of mentions whose two entries are integer indices
    that fall inside the document.
    """
    inputs = {"document": "This is a single string document about a test. Sometimes it "
                          "contains coreferent parts."}
    archive = load_archive(self.FIXTURES_ROOT / 'coref' / 'serialization' / 'model.tar.gz')
    predictor = Predictor.from_archive(archive, 'coreference-resolution')

    result = predictor.predict_json(inputs)

    document = result["document"]
    assert document == ['This', 'is', 'a', 'single', 'string', 'document', 'about',
                        'a', 'test', '.', 'Sometimes', 'it', 'contains', 'coreferent',
                        'parts', '.']

    clusters = result["clusters"]
    assert isinstance(clusters, list)
    for cluster in clusters:
        assert isinstance(cluster, list)
        for mention in cluster:
            start, end = mention[0], mention[1]
            # Spans should be integer indices.
            assert isinstance(start, int)
            assert isinstance(end, int)
            # Spans should be inside the document. Valid indices into a list of
            # length n are 0..n-1, so token 0 is allowed and n itself is not.
            # NOTE(review): this assumes the end index is inclusive - confirm
            # against the model's span convention.
            assert 0 <= start < len(document)
            assert 0 <= end < len(document)
def test_batch_prediction(self):
    """Batch-predict two reading-comprehension inputs and validate each result.

    Each result must contain a two-integer ``best_span`` with start <= end, a
    non-empty ``best_span_str``, and start/end probability lists of floats that
    each sum to one.
    """
    model_path = self.FIXTURES_ROOT / 'bidaf' / 'serialization' / 'model.tar.gz'
    predictor = Predictor.from_archive(load_archive(model_path), 'machine-comprehension')

    batch = [
            {
                    "question": "What kind of test succeeded on its first attempt?",
                    "passage": "One time I was writing a unit test, and it succeeded on the first attempt."
            },
            {
                    "question": "What kind of test succeeded on its first attempt at batch processing?",
                    "passage": "One time I was writing a unit test, and it always failed!"
            }
    ]
    results = predictor.predict_batch_json(batch)
    assert len(results) == 2

    for result in results:
        # The predicted span: a [start, end] pair of ints in order.
        best_span = result.get("best_span")
        assert best_span is not None
        assert isinstance(best_span, list)
        assert len(best_span) == 2
        assert all(isinstance(index, int) for index in best_span)
        span_start, span_end = best_span
        assert span_start <= span_end

        # The text of the predicted span.
        best_span_str = result.get("best_span_str")
        assert isinstance(best_span_str, str)
        assert best_span_str != ""

        # Start and end probabilities are each checked to sum to one.
        for key in ("span_start_probs", "span_end_probs"):
            probs = result.get(key)
            assert probs is not None
            assert all(isinstance(prob, float) for prob in probs)
            assert sum(probs) == approx(1.0)
def test_build_hierplane_tree(self):
    """Check ``_build_hierplane_tree`` on a small hand-written constituency tree."""

    def leaf(word, label):
        # A node without children; the label fills nodeType, attributes and link.
        return {'word': word, 'nodeType': label, 'attributes': [label], 'link': label}

    def branch(word, label, children):
        # An internal node: the same fields as a leaf plus its list of children.
        node = leaf(word, label)
        node['children'] = children
        return node

    tree = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    model_path = (self.FIXTURES_ROOT / 'constituency_parser' /
                  'serialization' / 'model.tar.gz')
    predictor = Predictor.from_archive(load_archive(model_path), 'constituency-parser')

    hierplane_tree = predictor._build_hierplane_tree(tree, 0, is_root=True)

    subject = branch('the dog', 'NP', [leaf('the', 'D'), leaf('dog', 'N')])
    direct_object = branch('the cat', 'NP', [leaf('the', 'D'), leaf('cat', 'N')])
    predicate = branch('chased the cat', 'VP', [leaf('chased', 'V'), direct_object])

    correct_tree = {
            'text': 'the dog chased the cat',
            "linkNameToLabel": LINK_TO_LABEL,
            "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
            'root': branch('the dog chased the cat', 'S', [subject, predicate]),
    }
    assert correct_tree == hierplane_tree