Example #1
import torch
import spacy
import neuralcoref
from pytorch_transformers import *
from gensim.parsing.porter import PorterStemmer
import question_generation.datasets as data

nlp = spacy.load("en_core_web_lg")
USE_COREF = False

if USE_COREF:
    neuralcoref.add_to_pipe(nlp)


class nlp_engine:
    def __init__(self):
        # self.tokenizer   = BertTokenizer.from_pretrained("bert-base-uncased")
        # self.model       = BertModel.from_pretrained("bert-base-uncased")
        self.use_coref = False
        self.vectorizer = data.vectorize('fast')
        self.stemmer = PorterStemmer()

    def make_multiple_choice(self, word, sentence, ai=False):
        # Only handle single-word targets that actually appear in the sentence.
        if len(word.split(' ')) == 1:
            if word in sentence.split(' '):
                # The 20 nearest neighbours in vector space serve as distractor candidates.
                most = self.vectorizer.most_similar(word.lower(), topn=20)
                choices = [x[0].lower() for x in most]
                tmp = list()
                stems = list()

                tmp.append(word.lower())
Example #2
 def __init__(self):
     self.nlp = spacy.load('en')
     neuralcoref.add_to_pipe(self.nlp)
Example #3
 def __init__(self, model_name):
   self.nlp = spacy.load(model_name)
   neuralcoref.add_to_pipe(self.nlp, greedyness=0.45)
   self.textclean = TextClean()
Example #4
def get_doc_example(text):
    # Attach neuralcoref so the returned Doc carries coreference
    # annotations (doc._.coref_clusters, doc._.coref_resolved).
    nlp = spacy.load("en_core_web_sm")
    neuralcoref.add_to_pipe(nlp)
    doc = nlp(text)
    return doc
Example #5
import spacy
import neuralcoref

# The top of this snippet is cut off; the model name here is an assumption.
nlp = spacy.load('en_core_web_lg')

rare_words = {
    'Dudley': ['man', 'boy'],
    'Ron': ['man', 'boy'],
    'Hagrid': ['man'],
    'Albus Dumbledore': ['Albus', 'headmaster', 'him', 'he'],
    'Draco': ['man', 'boy', 'Malfoy'],
    'McGonagall': ['woman', 'McGonagall', 'professor'],
    'Snape': ['man', 'Snape', 'him', 'he', 'professor', 'Severus'],
    'Professor Quirrell': ['man', 'Quirrell'],
    'Harry': ['man', 'boy', 'Harry Potter', 'Potter', 'his'],
    'Hermione': ['she', 'woman', 'girl', 'Hermione Granger', 'her'],
    'Ginny': ['she', 'her', 'woman', 'girl'],
    'Argus Filch': ['he', 'man'],
    'Voldemort': ['man', 'the Dark Lord', 'his', 'Lord Voldemort']
}

neuralcoref.add_to_pipe(nlp, max_dist_match=1000, greedyness=0.40)

hp_num = input("Enter harry potter book #: ")

nlp.get_pipe('neuralcoref').set_conv_dict(rare_words)

source_file = f"harrypotter{hp_num}.txt"
temp_file = f"harrypotter{hp_num}_clean.txt"

f = open(source_file)
f1 = open(temp_file, 'w')
"""replace punctuation with special char inside quotes so conversations are not treated as individual sentences"""


def removePunctFromQuotes(text):
    returnText = ""
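# A plausible continuation (assumed; the snippet cuts off here): stream the
# source book through the cleaner line by line before coreference resolution.
for line in f:
    f1.write(removePunctFromQuotes(line))
f.close()
f1.close()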
Example #6
import spacy
import neuralcoref

nlp = spacy.load('en')
neuralcoref.add_to_pipe(nlp, greedyness=0.55)  # 0.55 is the best value I've seen

str1 = u"Norway has a lot of electric cars, so many that it can make anyone driving a new vehicle with an internal combustion engine look like a Luddite. Mercedes-Benz brought us there to experience the EQC, and possibly to normalize it in a sea of EVs that makes California look like a land of late adopters. Outside Oslo, where cars were larger and more upscale than in other parts of Europe, and Tesla vehicles (S and X) are a more common sight than around Los Angeles or the Bay Area, the EQC fit right in. DON'T MISS: Mercedes-Benz EQC Edition 1886 electric SUV kicks off a new era \
After a couple of rain-soaked days driving the EQC there last week, we can say that it will be a great addition in the U.S. when it arrives sometime in 2020. At about 187 inches long, the EQC400 4Matic crossover splices into the American mid-sizers."

doc = nlp(str1)
# doc = nlp(u'John went to Norway and it was beautiful.')

print(doc._.coref_clusters)
print(doc._.coref_clusters[0].mentions)
print(str1)
print(doc._.coref_resolved)
Example #7
def get_coref_parser():
    nlp = spacy.load('en_core_web_sm')
    neuralcoref.add_to_pipe(nlp)
    nlp.tokenizer = NoTokenizer(nlp.vocab)
    return nlp
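
# NoTokenizer is defined elsewhere in this project. A minimal sketch, assuming
# it is a pass-through tokenizer for pre-tokenized, space-separated input:
from spacy.tokens import Doc

class NoTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        # Treat the input as already tokenized on single spaces.
        return Doc(self.vocab, words=text.split(' '))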
Example #8
 def __init__(self):
     self.nlp = spacy.load('en')
     neuralcoref.add_to_pipe(self.nlp)
     print("Server loaded")
     self.response = None
Example #9
 def __init__(self,
              spacy_model: str = 'zh_core_web_lg',
              greedyness: float = 0.45):
     self.nlp = spacy.load(spacy_model)
     neuralcoref.add_to_pipe(self.nlp, greedyness=greedyness)
Example #10
 def __init__(self, language: str = "en_core_web_sm", greedyness: float = 0.65):
     super().__init__()
     import spacy
     self.nlp = spacy.load(language)
     # Forward the configured greedyness to neuralcoref.
     neuralcoref.add_to_pipe(self.nlp, greedyness=greedyness)
Example #11
def replace_pronouns(orig_text,
                     name,
                     pronoun_replacement,
                     nlp,
                     correcting_they_pronouns=False):

    # For the singular-"they" transformation: track the text and the indexes
    # to revert. TODO: cache?

    if correcting_they_pronouns:
        # Swap in a greedier neuralcoref instance for singular-they handling.
        nlp.remove_pipe("neuralcoref")  # removes the current neuralcoref instance from the spaCy pipe
        neuralcoref.add_to_pipe(nlp, greedyness=0.6)

    doc = nlp(orig_text)
    text = []
    buffer_start = 0
    name_cluster = find_cluster(name, doc)
    if name_cluster is None:
        # Retry once with a greedier coref pass before giving up.
        nlp.remove_pipe("neuralcoref")  # removes the current neuralcoref instance from the spaCy pipe
        neuralcoref.add_to_pipe(nlp, greedyness=0.6)
        doc = nlp(orig_text)
        name_cluster = find_cluster(name, doc)
        if name_cluster is None:
            return [{'text': 'No Match', 'is_pronoun': False}]
    # TODO: need to check whether the mention is a pronoun
    print(name_cluster.mentions)

    pronouns = list_pronouns(name_cluster.mentions, pronoun_replacement)
    present_tense_heads = list_present_tense_heads(pronouns,
                                                   pronoun_replacement)

    print(present_tense_heads)
    altered_tokens = pronouns + present_tense_heads
    altered_tokens.sort(key=lambda altered_token: altered_token['token'].i)
    print('altered_tokens', altered_tokens)
    for altered_token in altered_tokens:
        token_index = altered_token['token'].i
        if token_index > buffer_start:
            # If we've skipped over some tokens, add those in
            # (with trailing whitespace if available).
            text += [{
                'text': doc[buffer_start:token_index].text + doc[token_index - 1].whitespace_,
                'is_pronoun': False,
                'index': token_index,
                'orig_text': altered_token['token'].text
            }]
        # Replace the token itself, then emit its trailing whitespace if available.
        text += [{
            'text': altered_token['replacement_text'],
            'is_pronoun': True,
            'index': token_index,
            'orig_text': altered_token['token'].text
        }]
        text += [{
            'text': doc[token_index].whitespace_,
            'is_pronoun': False,
            'index': token_index,
            'orig_text': altered_token['token'].text
        }]
        buffer_start = token_index + 1

    text += [{'text': doc[buffer_start:].text, 'is_pronoun': False}]

    # Drop empty segments.
    text = [segment for segment in text if segment['text'] != '']

    return text
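
# find_cluster, list_pronouns, and list_present_tense_heads are helpers defined
# elsewhere in this project. A minimal sketch of find_cluster, assuming it
# returns the coref cluster whose main mention contains the given name:
def find_cluster(name, doc):
    for cluster in doc._.coref_clusters:
        if name.lower() in cluster.main.text.lower():
            return cluster
    return None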
Example #12
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 27 19:55:37 2019

@author: sudhe
"""
import spacy
import neuralcoref
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

# One pipeline with neuralcoref for resolution, one plain pipeline for parsing.
coref_nlp = spacy.load('en')
neuralcoref.add_to_pipe(coref_nlp)
nlp = spacy.load('en_core_web_sm')

# Extract word vectors
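# A minimal sketch of the loading step this script relies on: word_embeddings
# maps each word to a 300-d vector. The file name is an assumption (e.g.
# pre-trained GloVe vectors).
word_embeddings = {}
with open('glove.6B.300d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')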


def My_Page_Rank(sentences):
    sentence_vectors = []
    lexArticle = dict()
    for sentence in sentences:
        # Average the word vectors of the sentence (zeros for OOV words);
        # the +0.001 guards against division by zero.
        if len(sentence) != 0:
            words = sentence.split()
            v = sum(word_embeddings.get(w, np.zeros((300,))) for w in words) / (len(words) + 0.001)
        else:
            v = np.zeros((300,))
        sentence_vectors.append(v)