def initialize(self, resources: Resources, configs: Config):
    super().initialize(resources, configs)
    if configs.tag_formalism not in MODEL2URL:
        raise ProcessorConfigError('Incorrect value for tag_formalism')
    if configs.tag_formalism == 'stanford':
        self.predictor = {
            'stanford': Predictor.from_path(MODEL2URL['stanford'])}
    if 'srl' in configs.processors:
        self.predictor = {
            'stanford': Predictor.from_path(MODEL2URL['stanford']),
            'srl': Predictor.from_path(MODEL2URL['srl'])}
    if configs.overwrite_entries:
        logger.warning("`overwrite_entries` is set to True, this means "
                       "that the entries of the same type as produced by "
                       "this processor will be overwritten if found.")
        if configs.allow_parallel_entries:
            logger.warning('Both `overwrite_entries` (whether to overwrite '
                           'the entries of the same type as produced by '
                           'this processor) and '
                           '`allow_parallel_entries` (whether to allow '
                           'similar new entries when they already exist) '
                           'are True, all existing conflicting entries '
                           'will be deleted.')
    else:
        if not configs.allow_parallel_entries:
            logger.warning('Both `overwrite_entries` (whether to overwrite '
                           'the entries of the same type as produced by '
                           'this processor) and '
                           '`allow_parallel_entries` (whether to allow '
                           'similar new entries when they already exist) '
                           'are False, processor will only run if there '
                           'are no existing conflicting entries.')
def __init__(self, wd=None, doc=None, debug=False, use_v=False,
             vfile=None, cfile=None, efile=None):
    if os.path.exists('/homes/du113/scratch/pretrained'):
        print('models already downloaded')
        self.srl_predictor = Predictor.from_path(
            '/homes/du113/scratch/pretrained/srl-model-2018.05.25.tar.gz')
        self.coref_predictor = Predictor.from_path(
            '/homes/du113/scratch/pretrained/coref-model-2018.02.05.tar.gz')
    else:
        print('downloading models...')
        self.srl_predictor = Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz")
        self.coref_predictor = Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz")
    self.word_dict = wd
    self.doc = doc
    self.debug = debug
    self.use_v = use_v
def initialize(self, resources: Resources, configs: Config):
    super().initialize(resources, configs)
    if ("pos" in configs.processors
            or "depparse" in configs.processors
            or "srl" in configs.processors):
        if "tokenize" not in self.configs.processors:
            raise ProcessorConfigError("tokenize is necessary in "
                                       "configs.processors for "
                                       "pos, depparse or srl")
    cuda_devices = itertools.cycle(configs["cuda_devices"])
    if configs.tag_formalism not in MODEL2URL:
        raise ProcessorConfigError("Incorrect value for tag_formalism")
    if configs.tag_formalism == "stanford":
        self.predictor = {
            "stanford": Predictor.from_path(
                configs["stanford_url"], cuda_device=next(cuda_devices))
        }
    if "srl" in configs.processors:
        self.predictor = {
            "stanford": Predictor.from_path(
                configs["stanford_url"], cuda_device=next(cuda_devices)),
            "srl": Predictor.from_path(
                configs["srl_url"], cuda_device=next(cuda_devices)),
        }
    if configs.overwrite_entries:
        logger.warning("`overwrite_entries` is set to True, this means "
                       "that the entries of the same type as produced by "
                       "this processor will be overwritten if found.")
        if configs.allow_parallel_entries:
            logger.warning("Both `overwrite_entries` (whether to overwrite"
                           " the entries of the same type as produced by "
                           "this processor) and "
                           "`allow_parallel_entries` (whether to allow "
                           "similar new entries when they already exist) "
                           "are True, all existing conflicting entries "
                           "will be deleted.")
    else:
        if not configs.allow_parallel_entries:
            logger.warning("Both `overwrite_entries` (whether to overwrite"
                           " the entries of the same type as produced by "
                           "this processor) and "
                           "`allow_parallel_entries` (whether to allow "
                           "similar new entries when they already exist) "
                           "are False, processor will only run if there "
                           "are no existing conflicting entries.")
def initialize(self, resource: Resources, configs: HParams):
    self.processors = configs.processors
    if self.processors is None or self.processors == "":
        self.processors = self.default_configs()['processors']
    if configs.output_format not in MODEL2URL:
        raise ProcessorConfigError('Incorrect value for output_format')
    model_url = MODEL2URL[configs.output_format]
    self.predictor = Predictor.from_path(model_url)
    self.overwrite_entries = configs.overwrite_entries
    self.allow_parallel_entries = configs.allow_parallel_entries
    if self.overwrite_entries:
        logger.warning("`overwrite_entries` is set to True, this means "
                       "that the entries of the same type as produced by "
                       "this processor will be overwritten if found.")
        if self.allow_parallel_entries:
            logger.warning('Both `overwrite_entries` (whether to overwrite '
                           'the entries of the same type as produced by '
                           'this processor) and '
                           '`allow_parallel_entries` (whether to allow '
                           'similar new entries when they already exist) '
                           'are True, all existing conflicting entries '
                           'will be deleted.')
    else:
        if not self.allow_parallel_entries:
            logger.warning('Both `overwrite_entries` (whether to overwrite '
                           'the entries of the same type as produced by '
                           'this processor) and '
                           '`allow_parallel_entries` (whether to allow '
                           'similar new entries when they already exist) '
                           'are False, processor will only run if there '
                           'are no existing conflicting entries.')
def allennlp(ques, embd_technique):
    text = ''
    if embd_technique == 'word2vec':
        word2vec_wv = model_manager.load_model(model='word2vec')
        text = ' '.join([s.strip() for s in main.find_documents_word2vec(
            ques, word2vec_wv, basic_search=False)])
    elif embd_technique == 'doc2vec':
        docvec_model = model_manager.load_model(model='doc2vec')
        text = ' '.join([s.strip() for s in main.find_documents_doc2vec(
            ques, docvec_model, basic_search=False)])
    elif embd_technique == 'fasttext':
        fasttext_wv = model_manager.load_model(model='fasttext')
        text = ' '.join([s.strip() for s in main.find_documents_fasttext(
            ques, fasttext_wv, basic_search=False)])
    elif embd_technique == 'tfidf':
        text = ' '.join([s.strip() for s in main.find_documents_tfidf(
            ques, basic_search=False)])
    elif embd_technique == 'glove':
        text = ' '.join([s.strip() for s in main.find_documents_glove(
            ques, basic_search=False)])
    print('Text:', text)
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/bidaf-elmo-model-2020.03.19.tar.gz")
    prediction = predictor.predict(passage=text, question=ques)['best_span_str']
    print('Answer:', prediction)
    return prediction
def test_constituency_parsing(self):
    predictor = Predictor.from_path(
        self.pretrained_models["structured-prediction-constituency-parser"].archive_file
    )
    sentence = """Pierre Vinken died aged 81; immortalised aged 61."""
    result = predictor.predict_json({"sentence": sentence})
    assert result["tokens"] == [
        "Pierre", "Vinken", "died", "aged", "81", ";",
        "immortalised", "aged", "61", ".",
    ]
    assert (
        result["trees"]
        == "(S (NP (NNP Pierre) (NNP Vinken)) (VP (VP (VBD died) (NP (JJ aged) (CD 81))) (, ;) (VP (VBN immortalised) (S (ADJP (JJ aged) (CD 61))))) (. .))"
    )
def test_textual_entailment(self):
    predictor = Predictor.from_path(
        self.pretrained_models["pair-classification-decomposable-attention-elmo"].archive_file
    )
    result = predictor.predict_json(
        {
            "premise": "An interplanetary spacecraft is in orbit around a gas giant's icy moon.",
            "hypothesis": "The spacecraft has the ability to travel between planets.",
        }
    )
    assert result["label_probs"][0] > 0.7  # entailment

    result = predictor.predict_json(
        {
            "premise": "Two women are wandering along the shore drinking iced tea.",
            "hypothesis": "Two women are sitting on a blanket near some rocks talking about politics.",
        }
    )
    assert result["label_probs"][1] > 0.8  # contradiction

    result = predictor.predict_json(
        {
            "premise": "A large, gray elephant walked beside a herd of zebras.",
            "hypothesis": "The elephant was lost.",
        }
    )
    assert result["label_probs"][2] > 0.6  # neutral
def __init__(self, ner_model: str, database_db: str,
             cuda_device: Union[List[int], int] = -1):
    self._ner_predictor = Predictor.from_path(ner_model)
    if cuda_device != -1:
        self._ner_predictor._model = self._ner_predictor._model.to(cuda_device)
    self._database_db = database_db
    self._cuda_device = cuda_device
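# A minimal usage sketch for the constructor above. `EntityLinker` is a
# hypothetical name for the enclosing class (not shown in this fragment),
# and both paths are placeholders, not real artifacts.
linker = EntityLinker(
    ner_model="/path/to/ner-model.tar.gz",  # any AllenNLP NER archive
    database_db="/path/to/entities.db",
    cuda_device=0,  # -1 keeps the model on CPU
)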
def _convert_deft_folder(input_path: Path,
                         output_file: TextIO,
                         cuda_device: int,
                         with_spacy: bool = True,
                         with_coref: bool = True) -> None:
    """Convert all files in the given folder."""
    if with_spacy:
        spacy_pipeline = spacy.load('en_core_web_lg')
        spacy_matcher = Matcher(spacy_pipeline.vocab)
        for pattern in SPACY_PATTERNS:
            flag = pattern['pattern_flag']
            Token.set_extension(flag, default=False, force=True)
            patterns = pattern['patterns']
            callback_fn = _get_extension_labeling_fn(flag)
            spacy_matcher.add(flag, callback_fn, *patterns)
    else:
        spacy_pipeline = None
        spacy_matcher = None

    if with_coref:
        coref_predictor = Predictor.from_path(
            archive_path="https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz",
            cuda_device=cuda_device
        )
        # Fix issues with character tokens smaller than the biggest convolution size
        coref_predictor._dataset_reader._token_indexers['token_characters']._min_padding_length = 5
    else:
        coref_predictor = None

    for input_file in tqdm(input_path.iterdir()):
        examples = _convert_deft_file(input_file,
                                      spacy_pipeline=spacy_pipeline,
                                      spacy_matcher=spacy_matcher,
                                      coref_predictor=coref_predictor)
        for example in examples:
            output_file.write(json.dumps(example) + '\n')
def __init__(self, model_dir_path, cuda_device=-1):
    SimpleAllenNLPClassifier.__init__(self, model_dir_path, cuda_device)
    self._model = Predictor.from_path(
        os.path.join(self.model_dir_path, 'model.tar.gz'),
        predictor_name='textual-entailment',
        cuda_device=self._cuda_device)
def bidaf(tkt):
    sp3 = """EXEC [dbo].[SPCRM] @TicketNumber=?"""
    sp5 = """EXEC [dbo].[SPInvVref_hs_v2] @Inv=?"""
    params = tkt
    cursor.execute(sp3, params)
    column_names_list = [x[0] for x in cursor.description]
    result_dicts = [
        dict(zip(column_names_list, row)) for row in cursor.fetchall()
    ]
    bidaf_df = pd.DataFrame(result_dicts)
    masterinv = defaultdict(list)
    model = Predictor.from_path("bidaf.tar.gz")
    question1 = "what is the invoice number?"
    # question2 = "what is the customer number?"
    # question3 = "what is the reference number?"
    invref = pd.DataFrame()
    for i, row in bidaf_df.iterrows():
        inv_no1 = model.predict(question1, row['description'])["best_span_str"]
        # inv_no2 = model.predict(question2, row['description'])["best_span_str"]
        # inv_no3 = model.predict(question3, row['description'])["best_span_str"]
        cursor.execute(sp5, inv_no1)
        column_names_list = [x[0] for x in cursor.description]
        result_dicts = [
            dict(zip(column_names_list, row)) for row in cursor.fetchall()
        ]
        invref = pd.DataFrame(result_dicts)
    return invref
def setUp(self):
    self.allens = {
        # TODO: Current download model is wrong on Allennlp.
        # 'universal': Predictor.from_path(MODEL2URL['universal']),
        'stanford': Predictor.from_path(MODEL2URL['stanford'])
    }
    self.results = {}
    for k in self.allens:
        self.results[k] = {}
    sentences = [
        "This tool is called Forte.",
        "The goal of this project to help you build NLP pipelines.",
        "NLP has never been made this easy before.",
        "Forte is named Forte because it is designed for text."
    ]
    self.document = ' '.join(sentences)
    for k in self.allens:
        self.results[k]['tokens'] = []
        self.results[k]['pos'] = []
        self.results[k]['dep_types'] = []
        self.results[k]['dep_heads'] = []
    for dep_type in self.allens.keys():
        for sent in sentences:
            results = self.allens[dep_type].predict(sentence=sent)
            self.results[dep_type]['tokens'].append(results['words'])
            self.results[dep_type]['pos'].append(results['pos'])
            self.results[dep_type]['dep_types'].append(
                results['predicted_dependencies'])
            self.results[dep_type]['dep_heads'].append(
                results['predicted_heads'])
def setUp(self):
    self.allens = {
        # TODO: Current download model is wrong on Allennlp.
        # 'universal': Predictor.from_path(MODEL2URL['universal']),
        "stanford": Predictor.from_path(MODEL2URL["stanford"])
    }
    self.results = {}
    for k in self.allens:
        self.results[k] = {}
    self.results["srl"] = {}
    sentences = [
        "This tool is called Forte.",
        "The goal of this project is to help you build NLP pipelines.",
        "NLP has never been made this easy before.",
        "Forte is named Forte because it is designed for text.",
    ]
    self.document = " ".join(sentences)
    for k in self.allens:
        self.results[k]["tokens"] = []
        self.results[k]["pos"] = []
        self.results[k]["dep_types"] = []
        self.results[k]["dep_heads"] = []
    self.results["srl"]["verbs"] = []
    self.results["srl"]["srl_tags"] = []
    for sent in sentences:
        for dep_type in self.allens.keys():
            results = self.allens[dep_type].predict(  # type: ignore
                sentence=sent
            )
            self.results[dep_type]["tokens"].append(results["words"])
            self.results[dep_type]["pos"].append(results["pos"])
            self.results[dep_type]["dep_types"].append(
                results["predicted_dependencies"]
            )
            self.results[dep_type]["dep_heads"].append(
                results["predicted_heads"]
            )
        srl_predictor = Predictor.from_path(MODEL2URL["srl"])
        srl_results = parse_allennlp_srl_results(
            srl_predictor.predict(sentence=sent)["verbs"]
        )
        self.results["srl"]["verbs"].append(srl_results["verbs"])
        self.results["srl"]["srl_tags"].append(srl_results["srl_tags"])
def __init__(self, model_path):
    # Note: SciBERT is imported but unused. This is because the import has
    # a side-effect of registering the SciBERT model, which we use later on.
    import scibert  # NOQA
    from allennlp.predictors import Predictor

    self.model_ = Predictor.from_path(model_path, predictor_name="text_classifier")
def _get_predictor(args: argparse.Namespace) -> Predictor:
    check_for_gpu(args.cuda_device)
    return Predictor.from_path(
        args.archive_path,
        predictor_name=args.predictor,
        cuda_device=args.cuda_device,
        overrides=args.overrides,
    )
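# A sketch of the Namespace that `_get_predictor` expects, mirroring the
# flags of `allennlp predict`; the archive path is a placeholder.
import argparse

args = argparse.Namespace(
    archive_path="/path/to/model.tar.gz",
    predictor=None,  # fall back to the model's registered default predictor
    cuda_device=-1,
    overrides="",
)
predictor = _get_predictor(args)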
def __init__(self, database_path, add_claim=False, k_wiki_results=None):
    self.db = FeverDocDB(database_path)
    self.add_claim = add_claim
    self.k_wiki_results = k_wiki_results
    self.proter_stemm = nltk.PorterStemmer()
    self.tokenizer = nltk.word_tokenize
    self.predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")
def setUp(self):
    self.allens = {
        'universal_dependencies':
            Predictor.from_path(MODEL2URL['universal_dependencies']),
        'stanford_dependencies':
            Predictor.from_path(MODEL2URL['stanford_dependencies'])
    }
    # Reuse the predictors above instead of loading each model a second time.
    univ = self.allens['universal_dependencies']
    stan = self.allens['stanford_dependencies']
    sentences = [
        "This tool is called Forte.",
        "The goal of this project to help you build NLP pipelines.",
        "NLP has never been made this easy before."
    ]
    self.document = ' '.join(sentences)
    self.tokens = []
    self.pos = {'stanford_dependencies': [], 'universal_dependencies': []}
    self.dep_types = {
        'stanford_dependencies': [],
        'universal_dependencies': []
    }
    self.dep_heads = {
        'stanford_dependencies': [],
        'universal_dependencies': []
    }
    for sent in sentences:
        univ_results = univ.predict(sentence=sent)
        stan_results = stan.predict(sentence=sent)
        self.tokens.append(univ_results['words'])
        self.pos['universal_dependencies'].append(univ_results['pos'])
        self.dep_types['universal_dependencies'].append(
            univ_results['predicted_dependencies'])
        self.dep_heads['universal_dependencies'].append(
            univ_results['predicted_heads'])
        self.pos['stanford_dependencies'].append(stan_results['pos'])
        self.dep_types['stanford_dependencies'].append(
            stan_results['predicted_dependencies'])
        self.dep_heads['stanford_dependencies'].append(
            stan_results['predicted_heads'])
def __init__(self):
    # Use the pretrained SRL model from AllenNLP for simplicity.
    self.predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz"
    )
    if torch.cuda.is_available():
        self.predictor._model = self.predictor._model.cuda()
def make(cls, path, gpu=False):
    # Importing allennlp_models.rc registers the BiDAF model and predictor;
    # the exact import location moved between allennlp-models versions.
    try:
        from allennlp_models.rc.models.bidaf import BidirectionalAttentionFlow  # NOQA
    except ImportError:
        import allennlp_models.rc  # NOQA
    cuda_device = 0 if gpu else -1
    return cls("BiDAF", Predictor.from_path(path, cuda_device=cuda_device), gpu=gpu)
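# Usage sketch: `ReaderModel` is a hypothetical stand-in for whichever class
# defines `make` above; the URL is the public AllenNLP BiDAF archive.
reader = ReaderModel.make(
    "https://storage.googleapis.com/allennlp-public-models/bidaf-model-2020.03.19.tar.gz",
    gpu=False,
)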
def test_openie(self):
    predictor = Predictor.from_path(
        self.pretrained_models["structured-prediction-srl"].archive_file
    )
    result = predictor.predict_json(
        {"sentence": "I'm against picketing, but I don't know how to show it."}
    )
    assert "verbs" in result
    assert "words" in result
def __init__(self, model_dir_path, cuda_device=-1):
    self._model_path = os.path.join(model_dir_path, 'segmenter_neural',
                                    'model.tar.gz')
    self._cuda_device = cuda_device
    self.predictor = Predictor.from_path(self._model_path,
                                         cuda_device=self._cuda_device)
    self.predictor._tokenizer = WhitespaceTokenizer()
    self._separator = 'U-S'
    self._threshold = 0.5
    self._use_logits = False
    self._symbol_map = SYMBOL_MAP
def __init__(self):
    # AllenNLP coreference pre-trained model
    pretrained_coref_path = './allennlp_pretrained/allennlp_coref-model-2018.02.05.tar.gz'
    if not os.path.exists(pretrained_coref_path):
        coref_url = "https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz"
        os.makedirs('allennlp_pretrained', exist_ok=True)
        wget.download(coref_url, pretrained_coref_path)
    self.predictor = Predictor.from_path(pretrained_coref_path)
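# Usage sketch: `CorefWrapper` is a hypothetical name for the class whose
# __init__ appears above; the coref predictor takes a raw document string
# and returns clusters of token spans.
wrapper = CorefWrapper()
clusters = wrapper.predictor.predict(
    document="Eva told Maria that she had passed the exam.")["clusters"]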
def load_model(slug):
    """Return an AllenNLP Predictor for the trained model for `slug`."""
    assert is_trained(slug), "We haven't trained a model to load yet"
    model_file_name = os.path.join(SERIALIZATION_DIR, slug, "model.tar.gz")
    PREDICTOR_NAMES = {
        "pos-tagging": "sentence-tagger",
        "translation": "seq2seq",
        "classification": "text_classifier",
    }
    name = PREDICTOR_NAMES[slug]
    return Predictor.from_path(model_file_name, predictor_name=name)
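# Usage sketch, assuming a "pos-tagging" model was already trained and
# serialized under SERIALIZATION_DIR/pos-tagging/model.tar.gz; the "tags"
# key applies to taggers whose output dict includes per-token tags.
tagger = load_model("pos-tagging")
print(tagger.predict(sentence="Time flies like an arrow.")["tags"])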
class SRL:
    # predictor = Predictor.from_path("/root/.allennlp/models/srl-model-2018.05.25.tar.gz")
    predictor = Predictor.from_path("./models/srl-model-2018.05.25.tar.gz")

    @staticmethod
    def get_srl(document):
        if SRL.validate_doc(document):
            return SRL.predictor.predict(document)

    @staticmethod
    def validate_doc(document):
        return True
def start_bundle(self):
    if self.predictor is not None:
        return
    model_dir = self.prepare_model()
    # The following imports are necessary bad practice: otherwise Beam tries
    # to serialize allennlp, and the deserialization breaks on Dataflow.
    from scibert.models import text_classifier  # NOQA
    from scibert.predictors.predictor import ScibertPredictor  # NOQA
    from allennlp.predictors import Predictor
    import scibert  # NOQA

    self.predictor = Predictor.from_path(model_dir, predictor_name="text_classifier")
class SRL:
    predictor = Predictor.from_path("./models/srl-model-2018.05.25.tar.gz")

    @staticmethod
    def get_srl(document):
        if SRL.validate_doc(document):
            return SRL.predictor.predict(document)

    @staticmethod
    def validate_doc(document):
        """
        Any validations should be carried out here
        """
        return True
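# Usage sketch: the predictor is loaded once at class-definition time, so
# the local archive path above must exist before this module is imported.
result = SRL.get_srl("The keys were left on the kitchen table.")
print(result["verbs"])  # one entry per predicate, with BIO-tagged arguments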
def my_sample_fever():
    logger = logging.getLogger()
    dictConfig({
        'version': 1,
        'formatters': {
            'default': {
                'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
            }
        },
        'handlers': {
            'wsgi': {
                'class': 'logging.StreamHandler',
                'stream': 'ext://sys.stderr',
                'formatter': 'default'
            }
        },
        'root': {
            'level': 'INFO',
            'handlers': ['wsgi']
        },
        'allennlp': {
            'level': 'INFO',
            'handlers': ['wsgi']
        },
    })
    logger.info("Columbia FEVER application")
    config = json.load(
        open(os.getenv("CONFIG_PATH", "configs/system_config.json")))
    ner_predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/fine-grained-ner-model-elmo-2018.12.21.tar.gz"
    )
    google_config = GoogleConfig(**config['retrieval']['google'])
    ranker = retriever.get_class('tfidf')(
        tfidf_path=config['retrieval']['tfidf']['index'])
    predictors = {}
    for key in ('page_model', 'state_model'):
        path = config[key].pop('path')
        predictors[key] = ColumbiaPredictor(path, config['cuda_device'],
                                            **config[key])

    # The prediction function that is passed to the web server for FEVER2.0
    def predict(instances):
        predictions = getDocsSingle(instances, google_config, ner_predictor,
                                    ranker)
        for key in ('page_model', 'state_model'):
            predictions = list(predictors[key].predict(predictions))
        return predictions

    return fever_web_api(predict)
def setup_route(app, route, model):
    allennlp_model = Predictor.from_path(model)

    @app.route(f'/{route}', methods=['POST'])
    def serve_route():
        data = request.get_json()
        if data is None:
            return jsonify({})
        text = data.get('text', None)
        if text is None:
            return jsonify({})
        return jsonify({'results': allennlp_model.predict(sentence=text)})
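# Usage sketch (assumptions: Flask is installed, and the archive path is a
# placeholder for a model whose predictor accepts a `sentence` argument,
# e.g. a dependency parser).
from flask import Flask, request, jsonify

app = Flask(__name__)
setup_route(app, 'parse', '/path/to/model.tar.gz')
# app.run(port=8000)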
def test_machine_comprehension(self):
    predictor = Predictor.from_path(self.pretrained_models["rc-bidaf"].archive_file)

    passage = """The Matrix is a 1999 science fiction action film written and directed by The Wachowskis, starring Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss, Hugo Weaving, and Joe Pantoliano. It depicts a dystopian future in which reality as perceived by most humans is actually a simulated reality called "the Matrix", created by sentient machines to subdue the human population, while their bodies' heat and electrical activity are used as an energy source. Computer programmer "Neo" learns this truth and is drawn into a rebellion against the machines, which involves other people who have been freed from the "dream world"."""
    question = "Who stars in The Matrix?"
    result = predictor.predict_json({"passage": passage, "question": question})
    correct = (
        "Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss, Hugo Weaving, and Joe Pantoliano"
    )
    assert correct == result["best_span_str"]
def __init__(self, database_path, add_claim=False, k_wiki_results=None):
    self.add_claim = add_claim
    self.k_wiki_results = k_wiki_results
    self.proter_stemm = nltk.PorterStemmer()
    self.tokenizer = nltk.word_tokenize
    self.predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")
    self.db = {}
    for file in os.listdir(database_path):
        with open(os.path.join(database_path, file), 'r', encoding='utf-8') as f:
            for line in f:
                line_json = json.loads(line.strip())
                self.db[line_json['id']] = line_json['lines']