def pronounResolution(doc):
    # Note: `doc` is currently unused; the utterance and context below are
    # hard-coded demo strings.
    coref = Coref()
    clusters = coref.one_shot_coref(
        utterances=u"Their parents love them very much.",
        context=u"I know twins")
    resolved_utterance_text = coref.get_resolved_utterances()
    print(resolved_utterance_text)
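# A minimal sketch (an assumption, not part of the original snippet) of how the
# same resolution could be driven by the `doc` argument instead of the
# hard-coded demo strings: feed doc.text to one_shot_coref and return the
# resolved utterances.
def pronoun_resolution_from_doc(doc):
    coref = Coref()
    coref.one_shot_coref(utterances=doc.text)
    return coref.get_resolved_utterances()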
def rnn(filename):
    # Download the text for one job from S3, run the coref-based relationship
    # pipeline on it, and upload the generated artifacts back to the bucket.
    bucket_name = "novelnet"
    coref = Coref()
    print("job " + filename + " is started")
    s3 = boto3.resource('s3')
    s3.Object("novelnet", filename).download_file("test")
    with open("test", 'r') as f:
        text = f.read()
    # print(text)
    relationship = Relationship(id=filename, pipeline=coref, text=text,
                                threshold=20, verbose=False)
    relationship.report()
    doc_name = 'doc' + filename + '.pkl'
    clusters_name = 'clusters' + filename + '.pkl'
    mentions_name = 'mentions' + filename + '.pkl'
    ner_name = 'ner' + filename + '.txt'
    s3.Object(bucket_name, "results/" + doc_name).upload_file(doc_name)
    s3.Object(bucket_name, "results/" + clusters_name).upload_file(clusters_name)
    s3.Object(bucket_name, "results/" + mentions_name).upload_file(mentions_name)
    s3.Object(bucket_name, "results/" + ner_name).upload_file(ner_name)
    os.remove("status")
    os.remove("test")
    print("job " + filename + " is finished")
def resolve_corefs(text):
    print("resolving pronoun co-refs...")
    document = nlp(text)
    with HiddenPrints():
        coref = Coref()
        context = ""
        for sentence in document:
            # print(str(sentence))
            if "they " in str(sentence):
                context += " " + str(sentence).strip()
                continue
            clusters = coref.one_shot_coref(utterances=str(sentence).strip(),
                                            context=context)
            resolved_utterance_text = coref.get_resolved_utterances()
            # print(resolved_utterance_text)
            context += " ".join(resolved_utterance_text).strip()
    return context
def main():
    with open('data/pre-nlp/reviews.json') as fr:
        reviews = json.load(fr)
    coref = Coref()
    progress = 1
    coref_reviews = []
    for review in reviews:
        text = review['text']
        try:
            coref.one_shot_coref(utterances=text)
        except Exception as e:
            print(e)
            continue
        review['text'] = coref.get_resolved_utterances()[0]
        coref_reviews.append(review.copy())
        print(progress)
        progress += 1
    with open('data/pre-nlp/reviews-after-coreference.json', 'w') as fw:
        fw.write(json.dumps(coref_reviews))
def extractRel(raw_text, endpoint):
    temppath = 'StanfordOpenIE/'
    tempfile = 'input.txt'
    text = raw_text.lower()
    logger.info(text)
    coref = Coref()
    clusters = coref.one_shot_coref(utterances=text)
    resolved_coref = coref.get_most_representative()
    mentions = coref.get_clusters()
    resolved_coref = {str(k): str(v) for k, v in resolved_coref.items()}
    # Replace each mention in the text with its most representative form.
    for key, val in resolved_coref.items():
        text = re.sub(' ' + key + ' ', ' ' + str(val) + ' ', text)
    relation_tuples = list()
    concept_pairs = conceptsimilarity.filter_pairs(text, endpoint)
    for concept1, concept2 in concept_pairs:
        sentences = text.split('.')
        for sentence in sentences:
            if concept1 in sentence and concept2 in sentence:
                with open(temppath + tempfile, 'w') as f:
                    f.write(sentence)
                for sub, rel, pred in conceptmap.relations(tempfile):
                    logger.info("concept1 %s", concept1)
                    logger.info("concept2 %s", concept2)
                    logger.info("sub %s", sub)
                    logger.info("rel %s", rel)
                    logger.info("pred %s", pred)
                    # Select relations whose concepts appear near each other in the
                    # paragraph: for now, concepts that lie in the same sentence and
                    # are related by OpenIE are kept.
                    if ((concept1 in sub) and (concept2 in pred)) or \
                            ((concept1 in pred) and (concept2 in sub)):
                        relation_tuples.append((sub, rel, pred))
    return relation_tuples
def doCoref(text):
    # Initialize Coref object and counters
    coref = Coref()
    results = resolve(text, coref)
    # fd = open("corefoutput.txt", "w")
    # for y in results:
    #     fd.write(str(y[0]) + "\n")
    # fd.close()
    return results
def process_text(path='Harry_Potter_and_the_Sorcerers_Stone.txt'):
    coref = Coref()
    with open(path, 'r') as f:
        text = f.read()
    relationship = RelationshipGolden(0, pipeline=coref, text=text,
                                      threshold=25, verbose=False)
    relationship.build_relationship()
    relationship.report()
    relationship.export_graph()
def resolve_pronouns(doc):
    coref = Coref()
    coref.one_shot_coref(utterances=doc.text)
    mentions = coref.get_mentions()
    # print(mentions, coref.get_scores())
    clusters = coref.get_clusters(remove_singletons=True)
    alias_groups = []
    for cluster in clusters[0].values():
        # cluster here is a list of mention indices
        aliases = []
        indices = []
        for mention_index in cluster:
            mention = mentions[mention_index]
            aliases.append(mention.text)
            indices.append((mention.start, mention.end))
        alias_groups.append((aliases, indices))
    return alias_groups
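# Usage sketch for resolve_pronouns(), assuming `nlp` is a loaded spaCy English
# model (an assumption, not shown in the original snippet); the sample text is
# illustrative only.
import spacy
nlp = spacy.load('en')
doc = nlp(u"Alice met Bob. She gave him a book.")
for aliases, spans in resolve_pronouns(doc):
    print(aliases, spans)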
from neuralcoref import Coref

coref = Coref()
clusters = coref.continuous_coref(
    utterances=u"John wanted to marry Mary. He was a doctor and she was a nurse. "
               u"She went to the market one day when Henry met her and proposed to her. "
               u"He was a local don.")
print(clusters)
mentions = coref.get_mentions()
# print(mentions)
utterances = coref.get_utterances()
print(utterances)
resolved_utterance_text = coref.get_resolved_utterances()
print(resolved_utterance_text)
print(coref.get_scores())
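# Hedged note on this older neuralcoref API (my understanding, not stated in the
# snippet itself): one_shot_coref() resets the conversation state on each call,
# while continuous_coref() keeps earlier utterances as context, so for example:
#
#   coref.one_shot_coref(utterances=u"John wanted to marry Mary.")
#   coref.continuous_coref(utterances=u"He was a doctor.")
#
# would let "He" be resolved against the utterance kept in the history.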
'''
sentences = paragraph.split('.')
for item in sentences[:1]:
    print(item)
'''

'''
COREFERENCE RESOLUTION
'''
con = "lamp on table"
utt = "chair near to table"
paragraph = "there is a table.there is a lamp on it.it has white colour"
sentences = paragraph.split('.')
coref = Coref()
clusters = coref.one_shot_coref(utterances=utt, context=con)
a = coref.get_mentions()
first = str(a[0])
one = []
for item in a:
    one.append(str(item))
two = first.split()
utterances = coref.get_utterances()
resolved_utterance_text = coref.get_resolved_utterances()
print(resolved_utterance_text)
# Words of the first mention that are not themselves full mentions.
diff = list(set(two) - set(one))
from neuralcoref import Coref

coref = Coref()
clusters = coref.one_shot_coref(utterances=u"She loves him.",
                                context=u"My sister has a dog.")
mentions = coref.get_mentions()
utterances = coref.get_utterances()
resolved_utterance_text = coref.get_resolved_utterances()
most_representative = coref.get_most_representative()
print(clusters)
print(mentions)
print(utterances)
print(resolved_utterance_text)
print(most_representative)
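# A small hedged follow-up sketch: get_most_representative() returns a mapping
# from mentions to their most representative coreferent mention, so it can be
# inspected pair by pair (the loop below is illustrative, not from the source).
for mention, representative in most_representative.items():
    print(str(mention), "->", str(representative))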
# -*- coding: utf-8 -*-
from nltk.corpus import wordnet as wn
import numpy as np
import data_helpers
import auxilary_data_helper
from tokenizeData import Lemmatizer
import itertools
import pickle
from keras.models import load_model
from neuralcoref import Coref
import sent2vec

sent2vec_model = sent2vec.Sent2vecModel()
sent2vec_model.load_model('torontobooks_unigrams.bin')
coref = Coref()


def evaluate_on_test(x_test, y_test, model):
    evaluate = model.evaluate(x_test, y_test)
    print("Evaluated against test dataset: " + str(evaluate))


def predict_movie(testStrs, model, labels_dict, vocabulary, event_voc, ners_voc,
                  multiple=False):
    # testStrs = ["A young girl rises to fame in Broadway while many other theatre
    # figures are jealous and disgusted. She starts as a general assistant to
    # another bright star of Broadway and slowly she becomes the new star in the
    # town. Another highschool girl offers to help and becomes a maid to Eve."]
def extract_triplets(sentence):
    nlp = spacy.load('en')
    # tok = nlp("John is a doctor and Mary is a nurse")
    # svos = find_triplets(tok)
    # print(svos)

    # Lower-case all occupation words for higher accuracy.
    words = sentence.split()
    cleaned_words = []
    for word in words:
        cleaned_word = word
        check_word = word.strip(",").strip(";").strip(".")
        if is_occupation(check_word.lower()):
            cleaned_word = cleaned_word.lower()
        cleaned_words.append(cleaned_word)
    sentence = " ".join(cleaned_words)
    # print(sentence)

    ne_tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)))
    iob_tagged = nltk.tree2conlltags(ne_tree)
    # print(iob_tagged)

    male_counter = 0
    female_counter = 0
    mappings = {}
    for name in ENGLISH_MALE:
        mappings[name] = []
    for name in ENGLISH_FEMALE:
        mappings[name] = []
    mappings[DEFAULT] = []
    b_persons = []

    # Replace proper nouns with placeholder English names so the coreference
    # model can resolve the pronouns, keeping a record of the original names.
    for ent in iob_tagged:
        if ent[1] == 'NNP':
            # print(is_male(ent[0].lower()))
            if is_male(ent[0].lower()) and male_counter < len(ENGLISH_MALE):
                sentence = sentence.replace(ent[0], ENGLISH_MALE[male_counter].capitalize())
                mappings[ENGLISH_MALE[male_counter]].append(ent[0])
                male_counter += 1
            elif is_female(ent[0].lower()) and female_counter < len(ENGLISH_FEMALE):
                sentence = sentence.replace(ent[0], ENGLISH_FEMALE[female_counter].capitalize())
                mappings[ENGLISH_FEMALE[female_counter]].append(ent[0])
                female_counter += 1
            else:
                sentence = sentence.replace(ent[0], DEFAULT)
                mappings[DEFAULT].append(ent[0])

    coref = Coref()
    clusters = coref.continuous_coref(utterances=sentence)
    sentence = coref.get_resolved_utterances()[0]
    print(sentence)

    tok = nlp(sentence)
    svos = find_triplets(tok)

    # Map the placeholder names in the extracted triples back to the originals.
    count = 0
    new_svos = []
    for triplet in svos:
        if triplet[0].capitalize() not in b_persons:
            if triplet[0] != DEFAULT:
                if triplet[0] in mappings.keys():
                    popped = triplet
                    addition = (mappings[popped[0]][0], popped[1], popped[2])
                    new_svos.append(addition)
                else:
                    popped = triplet
                    addition = (mappings[popped[0][count]], popped[1], popped[2])
                    new_svos.append(addition)
                    count += 1
            else:
                new_svos.append(triplet)
    return new_svos
            prev_noun = resolve_noun
            break
    return doc_list


sentence = "Louie is a quiet fellow." \
           " But that doesn't mean he will endure anything." \
           " Samantha loves this about him." \
           " Why wouldn't she?" \
           " Her whole childhood was under his shadow." \
           " John admired him, but he didn't know him, like she knew him." \
           " Louie used to say 'I know life'." \
           " He did."

coref = Coref()
cluster = coref.continuous_coref(utterances=sentence)
resolved_sentence = coref.get_resolved_utterances()
print(resolved_sentence)

# Commented-out alternative: spaCy-based anaphora resolution.
"""en_nlp = spacy.load('en_core_web_md')
# sentence = sentence.lower()
print(sentence)
en_doc = en_nlp(u'' + sentence)
prop_noun = ""
anaphora_mappings = {}
prop_noun_entities, prop_noun_entities_pos = get_named_entities(en_doc)
# prop_noun_entities = get_noun_chunks(en_doc, prop_noun_entities)
for entity in prop_noun_entities.keys():
    anaphora_mappings = map_entity_pronoun(prop_noun_entities, entity, anaphora_mappings)
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import LabelPowerset

# nlp libraries/api
import en_core_web_lg
from spacy import displacy
import gensim
from neuralcoref import Coref

spacy = en_core_web_lg.load()
coref = Coref(nlp=spacy)

# Load opinion lexicon
neg_file = open("../neg_words.txt", encoding="ISO-8859-1")
pos_file = open("../pos_words.txt", encoding="ISO-8859-1")
neg = [line.strip() for line in neg_file.readlines()]
pos = [line.strip() for line in pos_file.readlines()]
opinion_words = neg + pos

# Setup nltk corpora path and Google Word2Vec location
# google_vec_file = 'GoogleNews-vectors-negative300.bin'
# word2vec = gensim.models.KeyedVectors.load_word2vec_format(google_vec_file, binary=True)
# pickle.dump(word2vec, open("word2vec_google.pkl", 'wb'))
word2vec = pickle.load(open("../word2vec_google.pkl", 'rb'))

# load the Multi-label binarizer