Example No. 1
def _get_predictor(args: argparse.Namespace) -> Predictor:
    check_for_gpu(args.cuda_device)
    archive = load_archive(args.archive_file,
                           weights_file=args.weights_file,
                           cuda_device=args.cuda_device,
                           overrides=args.overrides)

    return Predictor.from_archive(archive, args.predictor)
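# A minimal usage sketch of the helper above (not part of the original
# snippet): the argument names simply mirror the attributes the function
# reads from the argparse.Namespace, and the archive path is supplied by the
# caller on the command line.
import argparse

from allennlp.common.checks import check_for_gpu
from allennlp.models.archival import load_archive
from allennlp.predictors.predictor import Predictor

parser = argparse.ArgumentParser()
parser.add_argument("archive_file")
parser.add_argument("--predictor", default=None)
parser.add_argument("--weights-file", default=None)
parser.add_argument("--cuda-device", type=int, default=-1)
parser.add_argument("--overrides", default="")
args = parser.parse_args()

predictor = _get_predictor(args)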
Example No. 2
from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/bidaf-model-2017.09.15-charpad.tar.gz"
)

print()
print()
print("================= MODEL OUTPUTS =================")

passage = "Robotics is an interdisciplinary branch of engineering and science that includes mechanical engineering, electrical engineering, computer science, and others. Robotics deals with the design, construction, operation, and use of robots, as well as computer systems for their control, sensory feedback, and information processing. These technologies are used to develop machines that can substitute for humans. Robots can be used in any situation and for any purpose, but today many are used in dangerous environments (including bomb detection and de-activation), manufacturing processes, or where humans cannot survive. Robots can take on any form but some are made to resemble humans in appearance. This is said to help in the acceptance of a robot in certain replicative behaviors usually performed by people. Such robots attempt to replicate walking, lifting, speech, cognition, and basically anything a human can do."

# json_result = predictor.predict(passage=passage, question="What is Robotics?")

# The reading-comprehension predictor needs both a passage and a question.
json_result = predictor.predict(
    passage=passage,
    question="What do robots that resemble humans attempt to do?")
print()
print("ANSWER : " + json_result["best_span_str"])
print()
print("================ END OF OUTPUTS =================")
print()
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
from decorators import *


@timer
def predict(predictor):
    prediction = predictor.predict(
        sentence=
        "Hong Kong, Seoul, Korea – January 20, 2021 – BASF has signed a global, exclusive supply agreement with Caregen, for four cosmetic peptides. With this expansion in its portfolio, BASF plans to launch four peptides with anti-aging and anti-pigmentation properties, for prone-atopic and prone-acneic skins, in the course of 2021.“There is robust growth in the dermocosmetics sector globally and especially in Asia, as more consumers desire targeted functionality from cosmetics, in line with the personalization trend. By using cosmetic products formulated with these peptides as active ingredients, consumers can have a choice of products with proven efficacy,” said Jeff Huh, Head of Marketing Personal Care Solutions at BASF Asia Pacific.“Peptides are widely used as biological actives in different markets. Selecting the most promising peptides from Caregen’s huge portfolio and adapting them to the standards of the cosmetic industry has been a great achievement of our team,” said David Hérault, Head of Global Product Development Bio-actives at BASF Care Chemicals.“Building on Caregen’s functional peptides with high technical and commercial competitiveness and BASF’s expertise in solutions and ingredients which are offered to the global cosmetics market, this agreement has both companies well set up for a long-term cooperation. With the ongoing functional health food and pharmaceutical product development, Caregen aims to soon become a leading peptide platform company, recognized by global customers,” said Dr. Chung Yong-Ji, CEO, Caregen. Caregen’s highly potent synthetic peptides will be a complementary technology to the bioactives and other cosmetic solutions and ingredients offered by BASF’s personal care portfolio."
    )
    # print(prediction)


if __name__ == "__main__":
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/ner-model-2020.02.10.tar.gz"
    )
    predict(predictor)
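# The `decorators` module imported above is local to that project and is not
# shown here. A plausible stand-in for its @timer decorator could look like
# this (an assumption, not the project's actual implementation):
import functools
import time


def timer(func):
    """Report how long the wrapped function takes to run."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        print(f"{func.__name__} finished in {time.perf_counter() - start:.2f}s")
        return result
    return wrapper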
def compute_accuracy_df(
    model_type: str,
    training_run: str,
    epoch: int,
    use_binary_labels: bool = True,
    add_lstm_preds: bool = False,
    use_roberta: bool = False,
) -> None:
    eval_dir_part = path.join('eval', model_type)
    if not path.exists(eval_dir_part):
        os.mkdir(eval_dir_part)
    eval_dir_whole = path.join(eval_dir_part, training_run)
    if not path.exists(eval_dir_whole):
        os.mkdir(eval_dir_whole)

    if not use_roberta:
        net = model.BERTFineTuningModel(2 if use_binary_labels else 3,
                                        RANDOM_SEED)
    else:
        net = model.RobertaFineTuningModel(2 if use_binary_labels else 3,
                                           RANDOM_SEED)
    net.load_state_dict(
        torch.load(
            path.join(BASE_MODEL_PATH, model_type, training_run,
                      f'model_epoch_{epoch}')))
    net.eval()
    tokenizer = (BertTokenizer.from_pretrained('bert-base-uncased')
                 if not use_roberta else
                 RobertaTokenizer.from_pretrained('roberta-base'))

    max_len = 80 if not use_roberta else 63
    sentiment_analysis_dataloader = data.load_sentiment_analysis_data(
        tokenizer,
        64,
        max_len,
        None,
        2,
        use_binary_labels,
        False,
    )

    if use_binary_labels:
        probs_positive, labels = compute_accuracy_two_class(
            sentiment_analysis_dataloader, net)
    else:
        probs_positive, labels = compute_accuracy_three_class(
            sentiment_analysis_dataloader, net, use_roberta)

    probs_str = 'bert_probs' if not use_roberta else 'roberta_probs'
    df = pd.DataFrame({
        probs_str if add_lstm_preds else 'probs': probs_positive,
        'labels': labels
    })

    dataset = sentiment_analysis.SentimentAnalysisDataset(tokenizer,
                                                          max_len,
                                                          split_to_use=2)
    assert len(dataset.sentences) == df.shape[0]
    df['sentence'] = dataset.sentences

    if add_lstm_preds:
        predictor_lstm = Predictor.from_path(const.LSTM_PATH)
        preds_lstm = [
            predictor_lstm.predict(sentence=dataset_sentence)['probs'][0]
            for dataset_sentence in dataset.sentences
        ]
        df['lstm_probs'] = preds_lstm

    df.to_csv(path.join(eval_dir_whole, f'test_accuracy_epoch_{epoch}.csv'),
              index=False)
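# A hypothetical invocation (model_type, training_run, and epoch are
# placeholders, not values from the original project): this would evaluate a
# binary BERT run and write eval/bert/run_1/test_accuracy_epoch_3.csv.
compute_accuracy_df(
    model_type="bert",
    training_run="run_1",
    epoch=3,
    use_binary_labels=True,
    add_lstm_preds=False,
    use_roberta=False,
)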
Example No. 5
import pandas as pd
import sys
if len(sys.argv) < 3:
    print("Usage: python pathtodata pathtooutput")
    sys.exit(1)

test = pd.read_csv(sys.argv[1])

from allennlp.predictors.predictor import Predictor
predictor = Predictor.from_path(
    "../data/data/bidaf-model-2017.09.15-charpad.tar.gz")
i = 0
single = {}
for index, row in test.iterrows():
    ans = predictor.predict(passage=row['contexts'], question=row['questions'])
    single[row['qids']] = ans['best_span_str']
    i += 1
    print(i, "Done!", end="\r")

import json
with open(sys.argv[2], "w") as write_file:
    json.dump(single, write_file)
print("saved results to", sys.argv[2])
Example No. 6
    def __init__(self):
        self.predictor = Predictor.from_path(
            'https://s3-us-west-2.amazonaws.com/allennlp/models/fine-grained-ner-model-elmo-2018.08.31.tar.gz'
        )
Example No. 7
from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/bidaf-elmo-model-2018.11.30-charpad.tar.gz"
)


def predict(sample, metadata):
    prediction = predictor.predict(passage=sample["passage"],
                                   question=sample["question"])
    return prediction["best_span_str"]
Example No. 8
def init_mrc():
    global mrc
    from allennlp.predictors.predictor import Predictor
    import allennlp_models.rc
    mrc = Predictor.from_path("../bidaf-elmo-model-2020.03.19.tar.gz")
Example No. 9
    def predict(self, question, passage):
        u"""
        Make a machine comprehension prediction on the supplied input.
        See https://rajpurkar.github.io/SQuAD-explorer/ for more information about the machine comprehension task.

        Parameters
        ----------
        question : ``str``
            A question about the content in the supplied paragraph.  The question must be answerable by a
            span in the paragraph.
        passage : ``str``
            A paragraph of information relevant to the question.

        Returns
        -------
        A dictionary that represents the prediction made by the system.  The answer string will be under the
        "best_span_str" key.
        """
        return self.predict_json({u"passage": passage, u"question": question})

    #overrides
    def _json_to_instance(self, json_dict):
        u"""
        Expects JSON that looks like ``{"question": "...", "passage": "..."}``.
        """
        question_text = json_dict[u"question"]
        passage_text = json_dict[u"passage"]
        return self._dataset_reader.text_to_instance(question_text,
                                                     passage_text)


BidafPredictor = Predictor.register(u'machine-comprehension')(BidafPredictor)
import collections
from allennlp.predictors.predictor import Predictor

PARSER = Predictor.from_path(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/biaffine-dependency-parser-ptb-2018.08.23.tar.gz"
)


def parse_dependency_tree(sentence):
    tree = PARSER.predict(sentence=sentence)
    words = tree['words']
    pos = tree['pos']
    deps = tree['predicted_dependencies']
    heads = tree['predicted_heads']
    return {'word': words, 'pos': pos, 'dep': deps, 'head': heads}
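# A small usage sketch (not in the original file): parse one sentence and
# print each token with its POS tag, dependency label, and predicted head
# index, as returned by parse_dependency_tree above.
if __name__ == '__main__':
    tree = parse_dependency_tree("The robot picked up the red block.")
    for word, pos, dep, head in zip(tree['word'], tree['pos'], tree['dep'],
                                    tree['head']):
        print(f"{word}\t{pos}\t{dep}\thead={head}")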


def get_children(tree):
    matrix = []
    for i in range(len(tree['head'])):
        children = []
        for j in range(len(tree['head'])):
            if i == j:
                continue
            if tree['head'][j] == i + 1:
                children.append(j)
        matrix.append(tuple(children))
    return matrix


def get_head(tree):
Example No. 11
from pprint import pprint
import nltk
nltk.data.path.append("/Volumes/Untitled 2/Users/sayeed")

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
from allennlp.predictors.predictor import Predictor
predictor = Predictor.from_path("bert-base-srl-2019.06.17.tar.gz")

#matching two strings based on what they mean, not how they are written
from pymongo import MongoClient
import json

def identify_parts(part_name, tags, words):
    start_tag = "B-"+part_name
    cont_tag = "I-"+part_name

    parts_list = []
    temp = ""
    for i in range(len(tags)):
        if tags[i] == start_tag:
            if temp == "":
                temp = words[i]
            else:
                parts_list.append(temp)
                temp = words[i]
        elif tags[i] == cont_tag:
            temp = temp + " "+words[i]
        else:
             if temp != "":
Example No. 12
        for instance in self._dataset_reader.get_instance(json_dict):
            if instance is not None:
                predicted_dict = self.predict_instance(instance)
        return predicted_dict


if __name__ == "__main__":
    parse = argparse.ArgumentParser("")
    parse.add_argument("model")
    parse.add_argument("dataset")
    parse.add_argument("output_file")
    parse.add_argument("--cuda_device", type=int, default=0)
    args = parse.parse_args()
    file_path = cached_path(args.model)
    archive = load_archive(file_path, cuda_device=args.cuda_device)
    predictor = Predictor.from_archive(archive, 'nabert_predictor')
    predictions = {}
    counter = 0
    with open(args.dataset) as fr:
        for line in fr:
            content = json.loads(line)
            predicted_dict = predictor.predict_json(content)
            for qa_p in content["qa_pairs"]:
                # predicted_dict = {"question_id": qa_p["qid"], "answer": {"value": random.choice(content["context"].split(" "))}}
                predictions[predicted_dict["question_id"]] = [
                    predicted_dict["answer"]["value"]
                ]
            if counter % 100 == 0:
                print(counter)
            counter += 1
    with open(args.output_file, 'w') as f_out:
        json.dump(predictions, f_out)
import re
import spacy
nlp = spacy.load("en_core_web_lg")
from allennlp.predictors.predictor import Predictor
predictor = Predictor.from_path("Downloads/srl-model-2018.02.27.tar.gz")


def return_dict(sent):
    """Returns a dictionary with all semantic role labels for an input sentence
       outputs a role-label : word dictionary structure.
    """

    pred = predictor.predict(sentence=sent)

    dict_roles = dict()
    temp = ''
    a = nlp(sent)
    root = [x.text for x in a if x.dep_ == 'ROOT'][0]
    for each in pred['verbs']:
        if each['verb'] == root:
            temp = each['description']
    roles_str = re.findall(r"\[(.*?)\]", temp)

    for each in roles_str:
        vals = each.split(':')
        dict_roles[vals[0]] = vals[1]
    return dict_roles


return_dict("Joe hit me with a Knife")
# {'ARG0': ' Joe', 'V': ' hit', 'ARG1': ' me', 'ARG2': ' with a Knife'}
Example No. 14
"""
Generate cloze style questions from existing mrqa datasets.
"""
import nltk
from allennlp.predictors.predictor import Predictor
import text_analyse
import multiprocessing
import json
import sys
import os
import gzip
from nltk.tokenize.treebank import TreebankWordDetokenizer
from termcolor import colored

predictor = Predictor.from_path("elmo-constituency-parser-2018.03.14.tar.gz")


# High Level Answer Category
def get_mask(label):
    if label in ['PERSON', 'NORP', 'ORG']:
        mask = 'PERSON/NORP/ORG'
    elif label in ['GPE', 'LOC', 'FAC']:
        mask = 'PLACE'
    elif label in ['PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE']:
        mask = 'THING'
    elif label in ['TIME', 'DATE']:
        mask = 'TEMPORAL'
    elif label in ['PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']:
        mask = 'NUMERIC'
    else:
        mask = label
Example No. 15
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from allennlp.predictors.predictor import Predictor
import allennlp_models.syntax.constituency_parser
predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz"
)

traindict = {
    'activate': [
        [
            "Here, we show that chronic treadmill exercise activates the mechanistic target of rapamycin (mTOR) pathway in mouse motor cortex.",
            ["treadmill exercise, activates, target"],
            "2019 Li Zhang Sci Adv. PMID: 31281888 Exercise Training Improves Motor Skill Learning via Selective Activation of mTOR"
        ],
        [
            "Together, exercise activates mTOR pathway, which is necessary for spinogenesis, neuronal activation, and axonal myelination leading to improved motor learning.",
            ["exercise, activates, mTOR pathway"],
            "2019 Li Zhang Sci Adv. PMID: 31281888 Exercise Training Improves Motor Skill Learning via Selective Activation of mTOR"
        ],
        [
            "In contrast, exposure to constant light, which perturbed the interval of inactivity (sleep) and led to the complete abolishment of activity/inactivity cycles, activated robustly proinflammatory state in the colon selectively via Stat3-dependent pathway.",
            ["exposure, activated, state"],
            "2017 Alena Sumova, Chronobiol Int. PMID: 29039977 Chronic Disruptions of Circadian Sleep Regulation Induce Specific Proinflammatory Responses in the Rat Colon"
        ],
        [
            "Research has shown that the observation of another's movement activates the corresponding motor representation in the observer.",
            ["observation, activates, motor representation"],
            "Marcel Brass, 2018, J Exp Psychol Hum Percept Perform. PMID: 29154630 Automatic Imitation of Multiple Agents: Simultaneous or Random Representation?"
    def __init__(self):
        # self.source_tgz = os.path.dirname(app.root_path) + "/sayhello/source/ner-elmo.2021-02-12.tar.gz"
        self.source_tgz = "https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz"
        self.predictor = Predictor.from_path(self.source_tgz)
        print('finished loading ner-elmo.2021-02-12.tar.gz')
import json
from pprint import pprint
import nltk
nltk.data.path.append("/Volumes/Untitled 2/Users/sayeed")

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import gensim.downloader as api

word_vectors = api.load("glove-wiki-gigaword-100")

lemmatizer = WordNetLemmatizer()
from allennlp.predictors.predictor import Predictor
predictor = Predictor.from_path(
    "/Volumes/Untitled 2/Users/sayeed/bert-base-srl-2019.06.17.tar.gz")

#matching two strings based on what they mean, not how they are written


def calc_similarity(phrase1, phrase2):
    try:
        words1 = word_tokenize(phrase1.lower())
        words2 = word_tokenize(phrase2.lower())
        return word_vectors.n_similarity(words1, words2)
    except KeyError:
        return 0.0


def identify_parts(part_name, tags, words):
    start_tag = "B-" + part_name
    cont_tag = "I-" + part_name
Example No. 18
            if str(tag).startswith('B-'):
                phrase = results['words'][index].lower()
                flag = True
            elif flag:
                phrase += " " + results['words'][index].lower()
                if str(tag).startswith('L-'):
                    flag = False
                    entity.append(phrase)
            else:
                entity.append(results['words'][index].lower())
    return entity, wordList


if __name__ == '__main__':
    predicts = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz"
    )
    predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/decomposable-attention-elmo-2018.02.19.tar.gz"
    )
    with open('./devset500.json', 'r', encoding='utf-8') as f:
        d = json.load(f)
        f.close()

    fullResult = {}
    time1 = time.time()
    for key, content in d.items():
        claim = content['claim']
        # print(claim)
        normClaim = " ".join(
            [lemmatize(word.lower()) for word in claim.split(" ")])
def extract_genders(wino_mt_en, wino_mt_genders):
    wino_mt_preprop = Path(wino_mt_en).read_text().strip().split('\n')

    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
    )

    docs_genders = ""
    for text in tqdm(wino_mt_preprop[:]):

        result = predictor.predict(document=text)

        word_genders = ['U'] * len(result['document'])
        for i, cluster in enumerate(result['clusters']):

            mark = None
            # Find pronoun
            for span_start, span_end in cluster:
                # assume is in span of size 1
                if span_end - span_start > 0:
                    continue

                word = result['document'][span_start].lower()

                if word in {"he", "his", "him"}:
                    mark = 'M'
                    break
                elif word in {"she", "her", "hers"}:
                    mark = 'F'
                    break

            if mark is None:
                continue

            for span_start, span_end in cluster:
                for i in range(span_start, span_end + 1):
                    word_genders[i] = mark

        doc_token_it = iter(zip(result['document'], word_genders))

        # Find matching tokens
        sentence_genders = []
        for sent in text.strip().split('\n'):
            token_genders = []
            for token in sent.strip().split(' '):
                doc_token, doc_gender = next(doc_token_it)
                while doc_token == '\n':
                    doc_token, doc_gender = next(doc_token_it)

                if token == doc_token:
                    token_genders.append(doc_gender)
                    continue

                # tokens differ start merging
                mark = 'U'
                try:
                    while doc_token != token:
                        next_doc_token, next_doc_gender = next(doc_token_it)
                        while next_doc_token == '\n':
                            next_doc_token, next_doc_gender = next(doc_token_it)
                        doc_token += next_doc_token
                        if next_doc_gender != 'U':
                            mark = next_doc_gender
                except Exception as e:
                    sys.stderr.write(f"Token: {token}\n")
                    sys.stderr.write(f"DocTokten: {doc_token}\n")
                    raise e
                token_genders.append(mark)
            sys.stdout.write(' '.join(token_genders) + '\n')
            sentence_genders.append(' '.join(token_genders))
        sentence_genders = '\n'.join(sentence_genders)
        docs_genders += sentence_genders.strip() + "\n"

    # Evaluate produced gender marks vs gold annotations
    if wino_mt_genders:
        gold_genders = Path(wino_mt_genders)

        y_true = [
            g.split() for g in gold_genders.read_text().strip().split('\n')
        ]
        y_pred = [g.split() for g in docs_genders.strip().split('\n')]

        # Flatten
        y_true = [gen for line in y_true for gen in line]
        y_pred = [gen for line in y_pred for gen in line]

        evaluation = precision_recall_fscore_support(y_true,
                                                     y_pred,
                                                     labels=['M', 'F', 'U'])
        sys.stderr.write(
            f"M p:{evaluation[0][0]} r:{evaluation[1][0]} f1:{evaluation[2][0]}\n"
        )
        sys.stderr.write(
            f"F p:{evaluation[0][1]} r:{evaluation[1][1]} f1:{evaluation[2][1]}\n"
        )
Example No. 20
import allennlp
from allennlp.predictors.predictor import Predictor
import pickle
import itertools
import numpy as np
import random
from nltk.stem import WordNetLemmatizer 
import gensim.downloader as api
import spacy


glove_model = api.load('glove-twitter-200')



#initializing nlp tools
predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz")
lemmatizer = WordNetLemmatizer() 

concept_edges_ = open("conceptNet_nyt_edges_swfree.txt").readlines()#pickle.load(open("concept_edges.pcl"))
concept_edges = []
for line in concept_edges_:
    if line[0] == "p" and "\t" in line[2:6]:
        concept_edges.append([])
    else:
        try:
            concept_edges[-1].append(eval(line.strip("\n")))
        except:
            pass
nlp = spacy.load('en_core_web_lg')

import neuralcoref
Example No. 21
# https://github.com/ffancellu/NegNN

import ast
import itertools
import pandas as pd
from collections import Counter
from nltk import Tree
import os

# We use the allennlp parser (I liked the outputs I sampled.)
# https://demo.allennlp.org/constituency-parsing/MTU5NjQxOQ==
# from jiant.utils.data_loaders import tokenize_and_truncate
from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz"
)


def nli():
    files = [
        "dev",
        "training",
        "sherlock_cardboard",
        "sherlock_circle",
        "unseen_full",
        "lexical_full",
        "mw_full",
        "prefixal_full",
        "simple_full",
        "suffixal_full",
Example No. 22
from __future__ import absolute_import
#overrides

from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.predictors.predictor import Predictor


class SimpleSeq2SeqPredictor(Predictor):
    u"""
    Predictor for the :class:`~allennlp.models.encoder_decoder.simple_seq2seq` model.
    """
    def predict(self, source):
        return self.predict_json({u"source": source})

    #overrides
    def _json_to_instance(self, json_dict):
        u"""
        Expects JSON that looks like ``{"source": "..."}``.
        """
        source = json_dict[u"source"]
        return self._dataset_reader.text_to_instance(source)


SimpleSeq2SeqPredictor = Predictor.register(u'simple_seq2seq')(
    SimpleSeq2SeqPredictor)
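# A usage sketch, not part of the original module: the archive path is a
# placeholder for a trained simple_seq2seq model, and the name passed to
# Predictor.from_archive matches the registration above.
from allennlp.models.archival import load_archive

archive = load_archive(u"/path/to/simple_seq2seq/model.tar.gz")
seq2seq_predictor = Predictor.from_archive(archive, u"simple_seq2seq")
print(seq2seq_predictor.predict(u"a source sentence to transform"))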
            graph.add_node(count, name=entity[ent])
            if ent == 'nsubj':
                graph.add_edges_from([(count, entity_id, {'name': 'definition'})])
            else:
                graph.add_edges_from([(count, entity_id, {'name': 'fact'})])
            count += 1


if __name__ == '__main__':
    ap = ArgumentParser()
    ap.add_argument('-k', '--knowledge_graph', help='Pickle of the KG')
    ap.add_argument('-p', '--persona_file', help='PersonaChat file')
    ap.add_argument('-d', '--document_file', help='Document file')
    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_sm")
    predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/bert-base-srl-2020.03.24.tar.gz")
    args = ap.parse_args()
    with open(args.document_file, 'r') as in_f:
        articles = in_f.readlines()
    # persona_traits = create_persona_traits(args.persona_file)
    with open(args.persona_file, 'r') as pers_file:
        persona_traits = pers_file.readlines()
    knowledge_graph = nx.read_gpickle(args.knowledge_graph)
    linker = EntityLinker()
    final_write = []
    count = 0
    for trait in persona_traits:
        if count == 10:
            break
        trait = trait.rstrip('\n')
        print(trait)
    def prep(self):
        self.model = Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/sst-2-basic-classifier-glove-2019.06.27.tar.gz"
        )
Example No. 25
def init_coref_models(coref_models):
    SPACY_MODEL = spacy.load('en_core_web_lg')

    model_url = 'externals/data/coref-model-2018.02.05.tar.gz'
    archive = load_archive(model_url, cuda_device=0)
    ALLEN_COREF_MODEL = Predictor.from_archive(archive)

    model_url = 'externals/data/biaffine-dependency-parser-ptb-2018.08.23.tar.gz'
    archive = load_archive(model_url, cuda_device=0)
    ALLEN_DEP_MODEL = Predictor.from_archive(archive)

    model_url = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz'
    archive = load_archive(model_url, cuda_device=0)
    ALLEN_PARSE_MODEL = Predictor.from_archive(archive)

    HUGGINGFACE_COREF_MODEL = spacy.load('en_core_web_lg')
    neuralcoref.add_to_pipe(HUGGINGFACE_COREF_MODEL)

    STANFORD_CORENLP_PATH = 'externals/stanford-corenlp-full-2018-10-05/'
    server = CoreNLPServer(
        classpath=STANFORD_CORENLP_PATH,
        corenlp_options=AttrDict({
            'port': 9090,
            'timeout': '600000',
            'thread': '4',
            'quiet': 'true',
            'preload': 'tokenize,ssplit,pos,lemma,parse,depparse,ner,coref'
        }))
    server.start()
    STANFORD_SERVER_URL = server.url
    STANFORD_MODEL = CoreNLPParser(url=STANFORD_SERVER_URL)

    syntactic_distance_coref_model = StanfordSyntacticDistanceModel(
        STANFORD_MODEL)
    parallelism_coref_model = ParallelismModel(ALLEN_DEP_MODEL, SPACY_MODEL)
    url_title_coref_model = URLModel(STANFORD_MODEL)
    stanford_coref_model = StanfordCorefModel(STANFORD_MODEL,
                                              algo='statistical')
    allen_coref_model = AllenNLPCorefModel(ALLEN_COREF_MODEL, SPACY_MODEL)
    huggingface_coref_model = HuggingfaceCorefModel(HUGGINGFACE_COREF_MODEL)
    lee_coref_model = LeeEtAl2017(
        SPACY_MODEL,
        config={
            'name': 'final',
            'log_root': 'externals/data/',
            'model': 'externals/modified_e2e_coref/experiments.conf',
            'context_embeddings_root': 'externals/data/',
            'head_embeddings_root': 'externals/data/',
            'char_vocab_root': 'externals/data/',
            'device': 0
        })

    logger.info('Waiting a minute to allow all models to load.')

    time.sleep(60)

    model_instances = {
        'syn': syntactic_distance_coref_model,
        'par': parallelism_coref_model,
        'url': url_title_coref_model,
        # 'stan': stanford_coref_model,
        'allen': allen_coref_model,
        'hug': huggingface_coref_model,
        'lee': lee_coref_model
    }

    coref_models = {name: model_instances[name] for name in coref_models}

    return coref_models
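# A hypothetical call (assumed, not from the original file): the keys come
# from the model_instances dictionary built above.
selected_models = init_coref_models(['allen', 'hug', 'lee'])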
Example No. 26
    def __init__(self):
        self.predictor = AllenNLPPredictor.from_path(
            "https://storage.googleapis.com/allennlp-public-models/bidaf-model-2020.02.10-charpad.tar.gz"
        )
    def __init__(self):
        # self.source_tgz = os.path.dirname(app.root_path) + "/sayhello/source/openie-model.2020.03.26.tar.gz"
        self.source_tgz = "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz"
        # print(source_tgz)
        # print("Start Loading")
        self.predictor = Predictor.from_path(self.source_tgz)
Example No. 28
def main():
    """
    Get a validation/test set, computes the compositional vectors of
    the noun compounds in the set, and saves the embeddings file.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('composition_model_path',
                    help='The composition model file (model.tar.gz)')
    ap.add_argument('nc_vocab', help='The noun compound vocabulary file')
    ap.add_argument('vocab', help='The word vocabulary file')
    ap.add_argument('out_vector_file', help='Where to save the gzipped file')
    args = ap.parse_args()

    with codecs.open(args.nc_vocab, 'r', 'utf-8') as f_in:
        nc_vocab = [line.strip().lower().replace('\t', ' ') for line in f_in]

    with codecs.open(args.vocab, 'r', 'utf-8') as f_in:
        vocab = [line.strip().lower().replace('\t', ' ') for line in f_in]

    vocab += ['_'.join(nc.split()) for nc in nc_vocab if len(nc.split()) == 2]

    logger.info(f'Loading model from {args.composition_model_path}')
    archive = load_archive(args.composition_model_path)
    model = archive.model

    with codecs.open(args.out_vector_file, 'a', 'utf-8') as f_out:
        logger.info(f'Computing vectors for the single words in {args.vocab}')
        reader = NCParaphraseDatasetReaderForWords()
        predictor = Predictor(model, dataset_reader=reader)

        for word in tqdm.tqdm(vocab):
            instance = reader.text_to_instance(word)

            if instance is None:
                logger.warning(f'Instance is None for {word}')
            else:
                curr_vector = predictor.predict_instance(instance)['vector']

                if len(curr_vector) == 1:
                    curr_vector = curr_vector[0]

                vector_text = ' '.join(map(str, curr_vector)).strip()
                f_out.write(f'dist_{word} {vector_text}\n')

        logger.info(
            f'Computing vectors for the noun compounds in {args.nc_vocab}')
        reader = NCParaphraseDatasetReader()
        for nc in tqdm.tqdm(nc_vocab):
            instance = reader.text_to_instance(nc)

            if instance is None:
                logger.warning(f'Instance is None for {nc}')
            else:
                curr_vector = predictor.predict_instance(instance)['vector']

                if len(curr_vector) == 1:
                    curr_vector = curr_vector[0]

                vector_text = ' '.join(map(str, curr_vector)).strip()
                nc = nc.replace(' ', '_')
                f_out.write(f'comp_{nc} {vector_text}\n')
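# A hypothetical command line, assuming main() above is invoked from an
# if __name__ == '__main__' guard (the script file name is a placeholder;
# the positional arguments follow the argparse definition above):
#   python compute_nc_vectors.py composition_model.tar.gz nc_vocab.txt vocab.txt out_vectors.txt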
    def __init__(self):
        # self.source_tgz = os.path.dirname(app.root_path) +
        # "/sayhello/source/decomposable-attention-elmo-2020.04.09.tar.gz"
        self.source_tgz = "https://storage.googleapis.com/allennlp-public-models/decomposable-attention-elmo-2020.04.09.tar.gz"
        self.predictor = Predictor.from_path(self.source_tgz)
Example No. 30
def _get_predictor(archive_file, predictor) -> Predictor:
    archive = load_archive(archive_file, cuda_device=-1)

    return Predictor.from_archive(archive, predictor)
Example No. 31
def main():

    # Setup Cuda if available, otherwise use the CPU
    device = -1

    if torch.cuda.is_available():
        device = torch.cuda.current_device()

    # Put data path here
    data_path = "data/raw/anne_bonnie.txt"
    save_path = "data/triples/"
    data_name = data_path.split('/')[-1]

    # Generate Models
    print("Generating models...")
    openie_model_url = "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz"
    openie_predictor = Predictor.from_path(openie_model_url,
                                           cuda_device=device)
    print("Generated openie predictor")

    spacy_sent = spacy.load('en_core_web_sm')
    spacy_sent.add_pipe(spacy_sent.create_pipe('sentencizer'))
    print("Generated Spacy Sentencizer")

    print("Finished generating models")

    sentences = []
    trimmed_triples = []

    # Split text data into sentences
    all_sentences = get_all_sentences(spacy_sent, data_path)

    t = time.localtime()
    timestamp = time.strftime('%b-%d-%y_%H:%M', t)

    remove_bad_triples = True
    good_triples = 0
    total_triples = len(all_sentences)

    # print("Doing co-reference analysis")
    # coref_data = get_coref_prediction(coref_predictor, text_data)

    for sent in all_sentences:
        print('Processing sentence:', sent.text)
        sentences.append(sent)

        # Get the root of the sentence
        sent_root = get_root_verb(sent)
        # print("Root Verb:", sent_root)

        # Extract a triple using OpenIE
        openie_result = create_openie_triple(openie_predictor,
                                             sent.text.strip())

        # Get relevant triple
        relevant_triple = get_relevant_triple(openie_result, sent_root)
        # print("Selected Triple", str(relevant_triple))

        # Trim the triple
        trimmed = trim_triple(spacy_sent, relevant_triple)
        trimmed_triples.append(trimmed)
        print("Trimmed Triple:", trimmed, "\n")

        if remove_bad_triples:
            if None in trimmed:
                sentences.pop()
                trimmed_triples.pop()
            else:
                good_triples += 1

    # Put sentence and triple data into a pandas dataframe for exporting
    triples_data = pd.DataFrame({
        'Sentence': sentences,
        'Trimmed Triple': trimmed_triples
    })

    print(good_triples, "triples of total", total_triples,
          "triples were extracted")

    # Store the DataFrame into a csv file for examination
    triples_data.to_csv(
        os.path.join(save_path + data_name + '_triples_' + timestamp +
                     '.csv'))

    # Create graph object
    G = nx.Graph()

    file_name = data_name + ' Graph ' + timestamp
    # Add nodes to graph and connect images
    for triple in trimmed_triples:
        G.add_edge(triple[0], triple[1])
        G.add_edge(triple[1], triple[2])

    # Create graph picture
    pos = nx.spring_layout(G)
    fig = plt.figure(figsize=(45, 45))
    fig.suptitle(file_name)
    nx.draw(G,
            pos,
            edge_color='black',
            width=1,
            linewidths=1,
            node_size=1000,
            node_color='seagreen',
            alpha=0.9,
            labels={node: node
                    for node in G.nodes()})

    # Save the graph as a picture
    plt.savefig(
        os.path.join(save_path + data_name + '_graph_' + timestamp + '.png'))