def _get_predictor(args: argparse.Namespace) -> Predictor:
    check_for_gpu(args.cuda_device)
    archive = load_archive(args.archive_file,
                           weights_file=args.weights_file,
                           cuda_device=args.cuda_device,
                           overrides=args.overrides)

    return Predictor.from_archive(archive, args.predictor)
from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/bidaf-model-2017.09.15-charpad.tar.gz"
)

print()
print()
print("================= MODEL OUTPUTS =================")

passage = "Robotics is an interdisciplinary branch of engineering and science that includes mechanical engineering, electrical engineering, computer science, and others. Robotics deals with the design, construction, operation, and use of robots, as well as computer systems for their control, sensory feedback, and information processing. These technologies are used to develop machines that can substitute for humans. Robots can be used in any situation and for any purpose, but today many are used in dangerous environments (including bomb detection and de-activation), manufacturing processes, or where humans cannot survive. Robots can take on any form but some are made to resemble humans in appearance. This is said to help in the acceptance of a robot in certain replicative behaviors usually performed by people. Such robots attempt to replicate walking, lifting, speech, cognition, and basically anything a human can do."

# json_result = predictor.predict(passage=passage, question="What is Robotics?")

# BiDAF needs both a passage and a question; predicting with only a question fails.
json_result = predictor.predict(
    passage=passage,
    question="What do robots that resemble humans attempt to do?")

print()
print("ANSWER : " + json_result["best_span_str"])
print()
print("================ END OF OUTPUTS =================")
print()
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging

from decorators import *


@timer
def predict(predictor):
    prediction = predictor.predict(
        sentence=
        "Hong Kong, Seoul, Korea – January 20, 2021 – BASF has signed a global, exclusive supply agreement with Caregen, for four cosmetic peptides. With this expansion in its portfolio, BASF plans to launch four peptides with anti-aging and anti-pigmentation properties, for prone-atopic and prone-acneic skins, in the course of 2021.“There is robust growth in the dermocosmetics sector globally and especially in Asia, as more consumers desire targeted functionality from cosmetics, in line with the personalization trend. By using cosmetic products formulated with these peptides as active ingredients, consumers can have a choice of products with proven efficacy,” said Jeff Huh, Head of Marketing Personal Care Solutions at BASF Asia Pacific.“Peptides are widely used as biological actives in different markets. Selecting the most promising peptides from Caregen’s huge portfolio and adapting them to the standards of the cosmetic industry has been a great achievement of our team,” said David Hérault, Head of Global Product Development Bio-actives at BASF Care Chemicals.“Building on Caregen’s functional peptides with high technical and commercial competitiveness and BASF’s expertise in solutions and ingredients which are offered to the global cosmetics market, this agreement has both companies well set up for a long-term cooperation. With the ongoing functional health food and pharmaceutical product development, Caregen aims to soon become a leading peptide platform company, recognized by global customers,” said Dr. Chung Yong-Ji, CEO, Caregen. Caregen’s highly potent synthetic peptides will be a complementary technology to the bioactives and other cosmetic solutions and ingredients offered by BASF’s personal care portfolio."
    )
    # print(prediction)


if __name__ == "__main__":
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/ner-model-2020.02.10.tar.gz"
    )
    predict(predictor)
def compute_accuracy_df(
    model_type: str,
    training_run: str,
    epoch: int,
    use_binary_labels: bool = True,
    add_lstm_preds: bool = False,
    use_roberta: bool = False,
) -> None:
    eval_dir_part = path.join('eval', model_type)
    if not path.exists(eval_dir_part):
        os.mkdir(eval_dir_part)
    eval_dir_whole = path.join(eval_dir_part, training_run)
    if not path.exists(eval_dir_whole):
        os.mkdir(eval_dir_whole)

    if not use_roberta:
        net = model.BERTFineTuningModel(2 if use_binary_labels else 3,
                                        RANDOM_SEED)
    else:
        net = model.RobertaFineTuningModel(2 if use_binary_labels else 3,
                                           RANDOM_SEED)
    net.load_state_dict(
        torch.load(
            path.join(BASE_MODEL_PATH, model_type, training_run,
                      f'model_epoch_{epoch}')))
    net.eval()

    tokenizer = (BertTokenizer.from_pretrained('bert-base-uncased')
                 if not use_roberta else
                 RobertaTokenizer.from_pretrained('roberta-base'))
    max_len = 80 if not use_roberta else 63
    sentiment_analysis_dataloader = data.load_sentiment_analysis_data(
        tokenizer,
        64,
        max_len,
        None,
        2,
        use_binary_labels,
        False,
    )

    if use_binary_labels:
        probs_positive, labels = compute_accuracy_two_class(
            sentiment_analysis_dataloader, net)
    else:
        probs_positive, labels = compute_accuracy_three_class(
            sentiment_analysis_dataloader, net, use_roberta)

    probs_str = 'bert_probs' if not use_roberta else 'roberta_probs'
    df = pd.DataFrame({
        probs_str if add_lstm_preds else 'probs': probs_positive,
        'labels': labels
    })

    dataset = sentiment_analysis.SentimentAnalysisDataset(tokenizer,
                                                          max_len,
                                                          split_to_use=2)
    assert len(dataset.sentences) == df.shape[0]
    df['sentence'] = dataset.sentences

    if add_lstm_preds:
        predictor_lstm = Predictor.from_path(const.LSTM_PATH)
        preds_lstm = [
            predictor_lstm.predict(sentence=dataset_sentence)['probs'][0]
            for dataset_sentence in dataset.sentences
        ]
        df['lstm_probs'] = preds_lstm

    df.to_csv(path.join(eval_dir_whole, f'test_accuracy_epoch_{epoch}.csv'),
              index=False)
import sys

import pandas as pd

if len(sys.argv) < 3:
    print(f"Usage: python {sys.argv[0]} pathtodata pathtooutput")
    sys.exit(1)

test = pd.read_csv(sys.argv[1])

from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path(
    "../data/data/bidaf-model-2017.09.15-charpad.tar.gz")

i = 0
single = {}
for index, row in test.iterrows():
    ans = predictor.predict(passage=row['contexts'],
                            question=row['questions'])
    single[row['qids']] = ans['best_span_str']
    i += 1
    print(i, "Done!", end="\r")

import json

with open(sys.argv[2], "w") as write_file:
    json.dump(single, write_file)

print("saved results to", sys.argv[2])
def __init__(self):
    self.predictor = Predictor.from_path(
        'https://s3-us-west-2.amazonaws.com/allennlp/models/fine-grained-ner-model-elmo-2018.08.31.tar.gz'
    )
from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/bidaf-elmo-model-2018.11.30-charpad.tar.gz"
)


def predict(sample, metadata):
    prediction = predictor.predict(passage=sample["passage"],
                                   question=sample["question"])
    return prediction["best_span_str"]
def init_mrc():
    global mrc
    from allennlp.predictors.predictor import Predictor
    import allennlp_models.rc

    mrc = Predictor.from_path("../bidaf-elmo-model-2020.03.19.tar.gz")
    def predict(self, question, passage):
        u"""
        Make a machine comprehension prediction on the supplied input.
        See https://rajpurkar.github.io/SQuAD-explorer/ for more information about the machine comprehension task.

        Parameters
        ----------
        question : ``str``
            A question about the content in the supplied paragraph.
            The question must be answerable by a span in the paragraph.
        passage : ``str``
            A paragraph of information relevant to the question.

        Returns
        -------
        A dictionary that represents the prediction made by the system.
        The answer string will be under the "best_span_str" key.
        """
        return self.predict_json({u"passage": passage, u"question": question})

    #overrides
    def _json_to_instance(self, json_dict):
        u"""
        Expects JSON that looks like ``{"question": "...", "passage": "..."}``.
        """
        question_text = json_dict[u"question"]
        passage_text = json_dict[u"passage"]
        return self._dataset_reader.text_to_instance(question_text, passage_text)


BidafPredictor = Predictor.register(u'machine-comprehension')(BidafPredictor)
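# A minimal usage sketch of the API documented above (not part of the original
# file). It assumes the public 2017 BiDAF archive used elsewhere in this
# collection; any machine-comprehension archive would work the same way.
from allennlp.predictors.predictor import Predictor

bidaf = Predictor.from_path(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/bidaf-model-2017.09.15-charpad.tar.gz")
result = bidaf.predict(
    passage="AllenNLP is an open-source NLP research library built on PyTorch.",
    question="What is AllenNLP built on?")
print(result["best_span_str"])  # the predicted answer span, e.g. "PyTorch"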
import collections

from allennlp.predictors.predictor import Predictor

PARSER = Predictor.from_path(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/biaffine-dependency-parser-ptb-2018.08.23.tar.gz"
)


def parse_dependency_tree(sentence):
    tree = PARSER.predict(sentence=sentence)
    words = tree['words']
    pos = tree['pos']
    deps = tree['predicted_dependencies']
    heads = tree['predicted_heads']
    return {'word': words, 'pos': pos, 'dep': deps, 'head': heads}


def get_children(tree):
    matrix = []
    for i in range(len(tree['head'])):
        children = []
        for j in range(len(tree['head'])):
            if i == j:
                continue
            if tree['head'][j] == i + 1:
                children.append(j)
        matrix.append(tuple(children))
    return matrix


def get_head(tree):
from pprint import pprint

import nltk

nltk.data.path.append("/Volumes/Untitled 2/Users/sayeed")
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path("bert-base-srl-2019.06.17.tar.gz")

# matching two strings based on what they mean, not how they are written
from pymongo import MongoClient
import json


def identify_parts(part_name, tags, words):
    start_tag = "B-" + part_name
    cont_tag = "I-" + part_name
    parts_list = []
    temp = ""
    for i in range(len(tags)):
        if tags[i] == start_tag:
            if temp == "":
                temp = words[i]
            else:
                parts_list.append(temp)
                temp = words[i]
        elif tags[i] == cont_tag:
            temp = temp + " " + words[i]
        else:
            if temp != "":
        for instance in self._dataset_reader.get_instance(json_dict):
            if instance is not None:
                predicted_dict = self.predict_instance(instance)
                return predicted_dict


if __name__ == "__main__":
    parse = argparse.ArgumentParser("")
    parse.add_argument("model")
    parse.add_argument("dataset")
    parse.add_argument("output_file")
    parse.add_argument("--cuda_device", type=int, default=0)
    args = parse.parse_args()

    file_path = cached_path(args.model)
    archive = load_archive(file_path, cuda_device=args.cuda_device)
    predictor = Predictor.from_archive(archive, 'nabert_predictor')

    predictions = {}
    counter = 0
    with open(args.dataset) as fr:
        for line in fr:
            content = json.loads(line)
            predicted_dict = predictor.predict_json(content)
            for qa_p in content["qa_pairs"]:
                # predicted_dict = {"question_id": qa_p["qid"], "answer": {"value": random.choice(content["context"].split(" "))}}
                predictions[predicted_dict["question_id"]] = [
                    predicted_dict["answer"]["value"]
                ]
            if counter % 100 == 0:
                print(counter)
            counter += 1

    json.dump(predictions, open(args.output_file, 'w'))
import re

import spacy

nlp = spacy.load("en_core_web_lg")

from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path("Downloads/srl-model-2018.02.27.tar.gz")


def return_dict(sent):
    """Returns a dictionary with all semantic role labels for an input sentence,
    as a role-label : word mapping.
    """
    pred = predictor.predict(sentence=sent)
    dict_roles = dict()
    temp = ''
    a = nlp(sent)
    root = [x.text for x in a if x.dep_ == 'ROOT'][0]
    for each in pred['verbs']:
        if each['verb'] == root:
            temp = each['description']
    roles_str = re.findall(r"\[(.*?)\]", temp)
    for each in roles_str:
        vals = each.split(':')
        dict_roles[vals[0]] = vals[1]
    return dict_roles


return_dict("Joe hit me with a Knife")
# {'ARG0': ' Joe', 'V': ' hit', 'ARG1': ' me', 'ARG2': ' with a Knife'}
""" Generate cloze style questions from existing mrqa datasets. """ import nltk from allennlp.predictors.predictor import Predictor import text_analyse import multiprocessing import json import sys import os import gzip from nltk.tokenize.treebank import TreebankWordDetokenizer from termcolor import colored predictor = Predictor.from_path("elmo-constituency-parser-2018.03.14.tar.gz") # High Level Answer Category def get_mask(label): if label in ['PERSON', 'NORP', 'ORG']: mask = 'PERSON/NORP/ORG' elif label in ['GPE', 'LOC', 'FAC']: mask = 'PLACE' elif label in ['PRODUCT', 'EVENT', 'WORKOFART', 'LAW', 'LANGUAGE']: mask = 'THING' elif label in ['TIME', 'DATA']: mask = 'TEMPORAL' elif label in ['PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']: mask = 'NUMERIC' else: mask = label
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from allennlp.predictors.predictor import Predictor
import allennlp_models.syntax.constituency_parser

predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz"
)

traindict = {
    'activate': [
        [
            "Here, we show that chronic treadmill exercise activates the mechanistic target of rapamycin (mTOR) pathway in mouse motor cortex.",
            ["treadmill exercise, activates, target"],
            "2019 Li Zhang Sci Adv. PMID: 31281888 Exercise Training Improves Motor Skill Learning via Selective Activation of mTOR"
        ],
        [
            "Together, exercise activates mTOR pathway, which is necessary for spinogenesis, neuronal activation, and axonal myelination leading to improved motor learning.",
            ["exercise, activates, mTOR pathway"],
            "2019 Li Zhang Sci Adv. PMID: 31281888 Exercise Training Improves Motor Skill Learning via Selective Activation of mTOR"
        ],
        [
            "In contrast, exposure to constant light, which perturbed the interval of inactivity (sleep) and led to the complete abolishment of activity/inactivity cycles, activated robustly proinflammatory state in the colon selectively via Stat3-dependent pathway.",
            ["exposure, activated, state"],
            "2017 Alena Sumova, Chronobiol Int. PMID: 29039977 Chronic Disruptions of Circadian Sleep Regulation Induce Specific Proinflammatory Responses in the Rat Colon"
        ],
        [
            "Research has shown that the observation of another's movement activates the corresponding motor representation in the observer.",
            ["observation, activates, motor representation"],
            "Marcel Brass, 2018, J Exp Psychol Hum Percept Perform. PMID: 29154630 Automatic Imitation of Multiple Agents: Simultaneous or Random Representation?"
def __init__(self):
    # self.source_tgz = os.path.dirname(app.root_path) + "/sayhello/source/ner-elmo.2021-02-12.tar.gz"
    self.source_tgz = "https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz"
    self.predictor = Predictor.from_path(self.source_tgz)
    print('load finish ner-elmo.2021-02-12.tar.gz')
import json
from pprint import pprint

import nltk

nltk.data.path.append("/Volumes/Untitled 2/Users/sayeed")
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import gensim.downloader as api

word_vectors = api.load("glove-wiki-gigaword-100")
lemmatizer = WordNetLemmatizer()

from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path(
    "/Volumes/Untitled 2/Users/sayeed/bert-base-srl-2019.06.17.tar.gz")


# matching two strings based on what they mean, not how they are written
def calc_similarity(phrase1, phrase2):
    try:
        words1 = word_tokenize(phrase1.lower())
        words2 = word_tokenize(phrase2.lower())
        return word_vectors.n_similarity(words1, words2)
    except KeyError:
        return 0.0


def identify_parts(part_name, tags, words):
    start_tag = "B-" + part_name
    cont_tag = "I-" + part_name
        if str(tag).startswith('B-'):
            phrase = results['words'][index].lower()
            flag = True
        elif flag:
            phrase += " " + results['words'][index].lower()
            if str(tag).startswith('L-'):
                flag = False
                entity.append(phrase)
        else:
            entity.append(results['words'][index].lower())
    return entity, wordList


if __name__ == '__main__':
    predicts = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz"
    )
    predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/decomposable-attention-elmo-2018.02.19.tar.gz"
    )
    with open('./devset500.json', 'r', encoding='utf-8') as f:
        d = json.load(f)
    f.close()
    fullResult = {}
    time1 = time.time()
    for key, content in d.items():
        claim = content['claim']
        # print(claim)
        normClaim = " ".join(
            [lemmatize(word.lower()) for word in claim.split(" ")])
def extract_genders(wino_mt_en, wino_mt_genders):
    wino_mt_preprop = Path(wino_mt_en).read_text().strip().split('\n')
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
    )

    docs_genders = ""
    for text in tqdm(wino_mt_preprop[:]):
        result = predictor.predict(document=text)
        word_genders = ['U'] * len(result['document'])
        for i, cluster in enumerate(result['clusters']):
            mark = None
            # Find pronoun
            for span_start, span_end in cluster:
                # assume it is in a span of size 1
                if span_end - span_start > 0:
                    continue
                word = result['document'][span_start].lower()
                if word in {"he", "his", "him"}:
                    mark = 'M'
                    break
                elif word in {"she", "her", "hers"}:
                    mark = 'F'
                    break
            if mark is None:
                continue
            for span_start, span_end in cluster:
                for i in range(span_start, span_end + 1):
                    word_genders[i] = mark

        doc_token_it = iter(zip(result['document'], word_genders))
        # Find matching tokens
        sentence_genders = []
        for sent in text.strip().split('\n'):
            token_genders = []
            for token in sent.strip().split(' '):
                doc_token, doc_gender = next(doc_token_it)
                while doc_token == '\n':
                    doc_token, doc_gender = next(doc_token_it)
                if token == doc_token:
                    token_genders.append(doc_gender)
                    continue
                # tokens differ, start merging
                mark = 'U'
                try:
                    while doc_token != token:
                        next_doc_token, next_doc_gender = next(doc_token_it)
                        while doc_token == '\n':
                            doc_token, doc_gender = next(doc_token_it)
                        doc_token += next_doc_token
                        if next_doc_gender != 'U':
                            mark = next_doc_gender
                except Exception as e:
                    sys.stderr.write(f"Token: {token}\n")
                    sys.stderr.write(f"DocToken: {doc_token}\n")
                    raise e
                token_genders.append(mark)
            sys.stdout.write(' '.join(token_genders) + '\n')
            sentence_genders.append(' '.join(token_genders))
        sentence_genders = '\n'.join(sentence_genders)
        docs_genders += sentence_genders.strip() + "\n"

    # Evaluate produced gender marks vs gold annotations
    if wino_mt_genders:
        gold_genders = Path(wino_mt_genders)
        y_true = [
            g.split() for g in gold_genders.read_text().strip().split('\n')
        ]
        y_pred = [g.split() for g in docs_genders.strip().split('\n')]
        # Flatten
        y_true = [gen for line in y_true for gen in line]
        y_pred = [gen for line in y_pred for gen in line]
        evaluation = precision_recall_fscore_support(y_true,
                                                     y_pred,
                                                     labels=['M', 'F', 'U'])
        sys.stderr.write(
            f"M p:{evaluation[0][0]} r:{evaluation[1][0]} f1:{evaluation[2][0]}\n"
        )
        sys.stderr.write(
            f"F p:{evaluation[0][1]} r:{evaluation[1][1]} f1:{evaluation[2][1]}\n"
        )
import allennlp
from allennlp.predictors.predictor import Predictor
import pickle
import itertools
import numpy as np
import random
import spacy
from nltk.stem import WordNetLemmatizer
import gensim.downloader as api

glove_model = api.load('glove-twitter-200')

# initializing nlp tools
predictor = Predictor.from_path(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz")
lemmatizer = WordNetLemmatizer()

concept_edges_ = open("conceptNet_nyt_edges_swfree.txt").readlines()  # pickle.load(open("concept_edges.pcl"))
concept_edges = []
for line in concept_edges_:
    if line[0] == "p" and "\t" in line[2:6]:
        concept_edges.append([])
    else:
        try:
            concept_edges[-1].append(eval(line.strip("\n")))
        except:
            pass

nlp = spacy.load('en_core_web_lg')
import neuralcoref
# https://github.com/ffancellu/NegNN
import ast
import itertools
import pandas as pd
from collections import Counter
from nltk import Tree
import os

# We use the allennlp parser (I liked the outputs I sampled.)
# https://demo.allennlp.org/constituency-parsing/MTU5NjQxOQ==
# from jiant.utils.data_loaders import tokenize_and_truncate
from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz"
)


def nli():
    files = [
        "dev",
        "training",
        "sherlock_cardboard",
        "sherlock_circle",
        "unseen_full",
        "lexical_full",
        "mw_full",
        "prefixal_full",
        "simple_full",
        "suffixal_full",
from __future__ import absolute_import

#overrides
from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.predictors.predictor import Predictor


class SimpleSeq2SeqPredictor(Predictor):
    u"""
    Predictor for the :class:`~allennlp.models.encoder_decoder.simple_seq2seq` model.
    """

    def predict(self, source):
        return self.predict_json({u"source": source})

    #overrides
    def _json_to_instance(self, json_dict):
        u"""
        Expects JSON that looks like ``{"source": "..."}``.
        """
        source = json_dict[u"source"]
        return self._dataset_reader.text_to_instance(source)


SimpleSeq2SeqPredictor = Predictor.register(u'simple_seq2seq')(
    SimpleSeq2SeqPredictor)
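# A minimal usage sketch for the predictor above (not part of the original
# file). The archive path is hypothetical and stands in for any trained
# simple_seq2seq model; it assumes the standard allennlp load_archive helper.
from allennlp.models.archival import load_archive

archive = load_archive("path/to/simple_seq2seq_model.tar.gz")  # hypothetical path
seq2seq = Predictor.from_archive(archive, u'simple_seq2seq')
print(seq2seq.predict(source="a source sequence to decode"))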
            graph.add_node(count, name=entity[ent])
            if ent == 'nsubj':
                graph.add_edges_from([(count, entity_id, {'name': 'definition'})])
            else:
                graph.add_edges_from([(count, entity_id, {'name': 'fact'})])
            count += 1


if __name__ == '__main__':
    ap = ArgumentParser()
    ap.add_argument('-k', '--knowledge_graph', help='Pickle of the KG')
    ap.add_argument('-p', '--persona_file', help='PersonaChat file')
    ap.add_argument('-d', '--document_file', help='Document file')

    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_sm")
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/bert-base-srl-2020.03.24.tar.gz")

    args = ap.parse_args()
    with open(args.document_file, 'r') as in_f:
        articles = in_f.readlines()
    # persona_traits = create_persona_traits(args.persona_file)
    with open(args.persona_file, 'r') as pers_file:
        persona_traits = pers_file.readlines()
    knowledge_graph = nx.read_gpickle(args.knowledge_graph)
    linker = EntityLinker()
    final_write = []
    count = 0
    for trait in persona_traits:
        if count == 10:
            break
        trait = trait.rstrip('\n')
        print(trait)
def prep(self):
    self.model = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/sst-2-basic-classifier-glove-2019.06.27.tar.gz"
    )
def init_coref_models(coref_models):
    SPACY_MODEL = spacy.load('en_core_web_lg')

    model_url = 'externals/data/coref-model-2018.02.05.tar.gz'
    archive = load_archive(model_url, cuda_device=0)
    ALLEN_COREF_MODEL = Predictor.from_archive(archive)

    model_url = 'externals/data/biaffine-dependency-parser-ptb-2018.08.23.tar.gz'
    archive = load_archive(model_url, cuda_device=0)
    ALLEN_DEP_MODEL = Predictor.from_archive(archive)

    model_url = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz'
    archive = load_archive(model_url, cuda_device=0)
    ALLEN_PARSE_MODEL = Predictor.from_archive(archive)

    HUGGINGFACE_COREF_MODEL = spacy.load('en_core_web_lg')
    neuralcoref.add_to_pipe(HUGGINGFACE_COREF_MODEL)

    STANFORD_CORENLP_PATH = 'externals/stanford-corenlp-full-2018-10-05/'
    server = CoreNLPServer(
        classpath=STANFORD_CORENLP_PATH,
        corenlp_options=AttrDict({
            'port': 9090,
            'timeout': '600000',
            'thread': '4',
            'quiet': 'true',
            'preload': 'tokenize,ssplit,pos,lemma,parse,depparse,ner,coref'
        }))
    server.start()
    STANFORD_SERVER_URL = server.url
    STANFORD_MODEL = CoreNLPParser(url=STANFORD_SERVER_URL)

    syntactic_distance_coref_model = StanfordSyntacticDistanceModel(
        STANFORD_MODEL)
    parallelism_coref_model = ParallelismModel(ALLEN_DEP_MODEL, SPACY_MODEL)
    url_title_coref_model = URLModel(STANFORD_MODEL)
    stanford_coref_model = StanfordCorefModel(STANFORD_MODEL,
                                              algo='statistical')
    allen_coref_model = AllenNLPCorefModel(ALLEN_COREF_MODEL, SPACY_MODEL)
    huggingface_coref_model = HuggingfaceCorefModel(HUGGINGFACE_COREF_MODEL)
    lee_coref_model = LeeEtAl2017(
        SPACY_MODEL,
        config={
            'name': 'final',
            'log_root': 'externals/data/',
            'model': 'externals/modified_e2e_coref/experiments.conf',
            'context_embeddings_root': 'externals/data/',
            'head_embeddings_root': 'externals/data/',
            'char_vocab_root': 'externals/data/',
            'device': 0
        })

    logger.info('Waiting a minute to allow all models to load.')
    time.sleep(60)

    model_instances = {
        'syn': syntactic_distance_coref_model,
        'par': parallelism_coref_model,
        'url': url_title_coref_model,
        # 'stan': stanford_coref_model,
        'allen': allen_coref_model,
        'hug': huggingface_coref_model,
        'lee': lee_coref_model
    }
    coref_models = {name: model_instances[name] for name in coref_models}
    return coref_models
def __init__(self):
    self.predictor = AllenNLPPredictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/bidaf-model-2020.02.10-charpad.tar.gz"
    )
def __init__(self):
    # self.source_tgz = os.path.dirname(app.root_path) + "/sayhello/source/openie-model.2020.03.26.tar.gz"
    self.source_tgz = "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz"
    # print(source_tgz)
    # print("Start Loading")
    self.predictor = Predictor.from_path(self.source_tgz)
def main():
    """
    Gets a validation/test set, computes the compositional vectors of the noun
    compounds in the set, and saves the embeddings file.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('composition_model_path',
                    help='The composition model file (model.tar.gz)')
    ap.add_argument('nc_vocab', help='The noun compound vocabulary file')
    ap.add_argument('vocab', help='The word vocabulary file')
    ap.add_argument('out_vector_file', help='Where to save the gzipped file')
    args = ap.parse_args()

    with codecs.open(args.nc_vocab, 'r', 'utf-8') as f_in:
        nc_vocab = [line.strip().lower().replace('\t', ' ') for line in f_in]

    with codecs.open(args.vocab, 'r', 'utf-8') as f_in:
        vocab = [line.strip().lower().replace('\t', ' ') for line in f_in]

    vocab += ['_'.join(nc.split()) for nc in nc_vocab if len(nc.split()) == 2]

    logger.info(f'Loading model from {args.composition_model_path}')
    archive = load_archive(args.composition_model_path)
    model = archive.model

    with codecs.open(args.out_vector_file, 'a', 'utf-8') as f_out:
        logger.info(f'Computing vectors for the single words in {args.vocab}')
        reader = NCParaphraseDatasetReaderForWords()
        predictor = Predictor(model, dataset_reader=reader)

        for word in tqdm.tqdm(vocab):
            instance = reader.text_to_instance(word)

            if instance is None:
                logger.warning(f'Instance is None for {word}')
            else:
                curr_vector = predictor.predict_instance(instance)['vector']

                if len(curr_vector) == 1:
                    curr_vector = curr_vector[0]

                vector_text = ' '.join(map(str, curr_vector)).strip()
                f_out.write(f'dist_{word} {vector_text}\n')

        logger.info(
            f'Computing vectors for the noun compounds in {args.nc_vocab}')
        reader = NCParaphraseDatasetReader()

        for nc in tqdm.tqdm(nc_vocab):
            instance = reader.text_to_instance(nc)

            if instance is None:
                logger.warning(f'Instance is None for {nc}')
            else:
                curr_vector = predictor.predict_instance(instance)['vector']

                if len(curr_vector) == 1:
                    curr_vector = curr_vector[0]

                vector_text = ' '.join(map(str, curr_vector)).strip()
                nc = nc.replace(' ', '_')
                f_out.write(f'comp_{nc} {vector_text}\n')
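# A hypothetical invocation of the script above (the script and file names are
# placeholders, not from the original repository):
#
#   python compute_nc_vectors.py composition_model/model.tar.gz nc_vocab.txt word_vocab.txt nc_vectors.txt
#
# The output file then holds one "dist_<word> <vector>" line per single word
# and one "comp_<noun_compound> <vector>" line per noun compound.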
def __init__(self):
    # self.source_tgz = os.path.dirname(app.root_path) +
    # "/sayhello/source/decomposable-attention-elmo-2020.04.09.tar.gz"
    self.source_tgz = "https://storage.googleapis.com/allennlp-public-models/decomposable-attention-elmo-2020.04.09.tar.gz"
    self.predictor = Predictor.from_path(self.source_tgz)
def _get_predictor(archive_file, predictor) -> Predictor:
    archive = load_archive(archive_file, cuda_device=-1)
    return Predictor.from_archive(archive, predictor)
def main():
    # Set up CUDA if available, otherwise use the CPU
    device = -1
    if torch.cuda.is_available():
        device = torch.cuda.current_device()

    # Put data path here
    data_path = "data/raw/anne_bonnie.txt"
    save_path = "data/triples/"
    data_name = data_path.split('/')[-1]

    # Generate models
    print("Generating models...")
    openie_model_url = "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz"
    openie_predictor = Predictor.from_path(openie_model_url,
                                           cuda_device=device)
    print("Generated openie predictor")

    spacy_sent = English()
    spacy_sent = spacy.load('en_core_web_sm')
    spacy_sent.add_pipe(spacy_sent.create_pipe('sentencizer'))
    print("Generated Spacy Sentencizer")
    print("Finished generating models")

    sentences = []
    trimmed_triples = []

    # Split text data into sentences
    all_sentences = get_all_sentences(spacy_sent, data_path)

    t = time.localtime()
    timestamp = time.strftime('%b-%d-%y_%H:%M', t)

    remove_bad_triples = True
    good_triples = 0
    total_triples = len(all_sentences)

    # print("Doing co-reference analysis")
    # coref_data = get_coref_prediction(coref_predictor, text_data)

    for sent in all_sentences:
        print('Processing sentence:', sent.text)
        sentences.append(sent)

        # Get the root of the sentence
        sent_root = get_root_verb(sent)
        # print("Root Verb:", sent_root)

        # Extract a triple using OpenIE
        openie_result = create_openie_triple(openie_predictor,
                                             sent.text.strip())

        # Get the relevant triple
        relevant_triple = get_relevant_triple(openie_result, sent_root)
        # print("Selected Triple", str(relevant_triple))

        # Trim the triple
        trimmed = trim_triple(spacy_sent, relevant_triple)
        trimmed_triples.append(trimmed)
        print("Trimmed Triple:", trimmed, "\n")

        if remove_bad_triples == True:
            if None in trimmed:
                sentences.pop()
                trimmed_triples.pop()
            else:
                good_triples += 1

    # Put sentence and triple data into a pandas dataframe for exporting
    triples_data = pd.DataFrame({
        'Sentence': sentences,
        'Trimmed Triple': trimmed_triples
    })
    print(good_triples, " triples of total ", total_triples,
          " triples were extracted")

    # Store the DataFrame into a csv file for examination
    triples_data.to_csv(
        os.path.join(save_path + data_name + '_triples_ ' + timestamp +
                     '.csv'))

    # Create graph object
    G = nx.Graph()
    file_name = data_name + ' Graph ' + timestamp

    # Add nodes to the graph and connect them with edges
    for triple in trimmed_triples:
        G.add_edge(triple[0], triple[1])
        G.add_edge(triple[1], triple[2])

    # Create graph picture
    pos = nx.spring_layout(G)
    fig = plt.figure(figsize=(45, 45))
    fig.suptitle(file_name)
    nx.draw(G,
            pos,
            edge_color='black',
            width=1,
            linewidths=1,
            node_size=1000,
            node_color='seagreen',
            alpha=0.9,
            labels={node: node for node in G.nodes()})

    # Save the graph as a picture
    plt.savefig(
        os.path.join(save_path + data_name + '_graph_' + timestamp + '.png'))