def load_data(row):
    q1 = parser(str(row[0]))
    q2 = parser(str(row[1]))

    # named-entity lemmas from each question
    set_ent1 = {ele.lemma_ for ele in q1.ents}
    set_ent2 = {ele.lemma_ for ele in q2.ents}
    num_full_match, num_part_match, rate_full_match, rate_part_match = feat2(
        set_ent1, set_ent2)

    # lowercased, then lemmatized (subject, verb, object) triples
    set_svo1 = {(ele[0].lower(), ele[1].lower(), ele[2].lower())
                for ele in findSVOs(q1)}
    set_svo2 = {(ele[0].lower(), ele[1].lower(), ele[2].lower())
                for ele in findSVOs(q2)}
    set_svo1 = {(wnl.lemmatize(ele[0]), wnl.lemmatize(ele[1]),
                 wnl.lemmatize(ele[2])) for ele in set_svo1}
    set_svo2 = {(wnl.lemmatize(ele[0]), wnl.lemmatize(ele[1]),
                 wnl.lemmatize(ele[2])) for ele in set_svo2}

    # verb-object pairs (drop the subject)
    set_vo1 = {(ele[1], ele[2]) for ele in set_svo1}
    set_vo2 = {(ele[1], ele[2]) for ele in set_svo2}
    num_vo, val_vo, rate_vo = feat(set_vo1, set_vo2)

    return [
        num_full_match, num_part_match, rate_full_match, rate_part_match,
        num_vo, val_vo, rate_vo
    ]
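# The function above (and the larger load_data variant later in this section)
# assumes module-level helpers that are not shown: a spaCy `parser`, an NLTK
# WordNetLemmatizer `wnl`, a `weights` dict, and set-overlap scorers
# `feat`/`feat2`. A minimal, hypothetical sketch of `feat`, inferred only from
# its (num, val, rate) return shape; the real implementation may differ, and
# `feat2` (four return values) is not sketched here:
def feat(set1, set2):
    """Overlap count, summed weight of the overlap, and a Jaccard-style rate."""
    common = set1 & set2
    union = set1 | set2
    num = len(common)
    val = sum(weights.get(ele, 0) for ele in common)  # `weights` is the assumed module-level dict
    rate = num / len(union) if union else 0.0
    return num, val, rate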
def nlp_parse(q1, q2=None):
    token1 = []
    lemma1 = []
    pos1 = []
    tag1 = []
    dep1 = []
    # shape1 = []
    alpha1 = []
    stop1 = []
    doc1 = nlp(str(q1))
    for w in doc1:
        token1.append(w.text)
        lemma1.append(w.lemma_)
        pos1.append(w.pos_)
        tag1.append(w.tag_)
        dep1.append(w.dep_)
        # shape1.append(w.shape_)
        alpha1.append(w.is_alpha)
        stop1.append(w.is_stop)
    word_cnt1 = len(token1)
    svo1 = findSVOs(doc1)
    ents1 = [(e.label_, e.text) for e in doc1.ents]
    alpha1_cnt = sum(alpha1)
    stop1_cnt = sum(stop1)
    svo1_len = len(svo1)

    if q2 is None:
        return (token1, lemma1, pos1, tag1, dep1, stop1, word_cnt1, svo1,
                ents1, alpha1_cnt, stop1_cnt, svo1_len)

    doc2 = nlp(str(q2))
    doc_similarity = doc1.similarity(doc2)
    token2 = []
    lemma2 = []
    pos2 = []
    tag2 = []
    dep2 = []
    # shape2 = []
    alpha2 = []
    stop2 = []
    for w in doc2:
        token2.append(w.text)
        lemma2.append(w.lemma_)
        pos2.append(w.pos_)
        tag2.append(w.tag_)
        dep2.append(w.dep_)
        # shape2.append(w.shape_)
        alpha2.append(w.is_alpha)
        stop2.append(w.is_stop)
    word_cnt2 = len(token2)
    svo2 = findSVOs(doc2)
    ents2 = [(e.label_, e.text) for e in doc2.ents]
    alpha2_cnt = sum(alpha2)
    stop2_cnt = sum(stop2)
    svo2_len = len(svo2)

    return (token1, lemma1, pos1, tag1, dep1, stop1, word_cnt1, svo1,
            ents1, alpha1_cnt, stop1_cnt, svo1_len,
            token2, lemma2, pos2, tag2, dep2, stop2, word_cnt2, svo2,
            ents2, alpha2_cnt, stop2_cnt, svo2_len,
            doc_similarity)
def old_get_svos():
    doc = nlp(u'' + request.args.get('sentence'))
    svos = findSVOs(doc)
    printDeps(doc)
    print(svos)
    return json.dumps(svos)
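# The view above reads request.args and returns JSON, so it presumably lives
# inside a Flask app. A minimal sketch of the wiring it assumes; the app
# object, route path, and model name are hypothetical, not from the source:
import json

import spacy
from flask import Flask, request
from subject_object_extraction import findSVOs

app = Flask(__name__)
nlp = spacy.load('en_core_web_sm')


@app.route('/svos')
def get_svos():
    doc = nlp(request.args.get('sentence', ''))
    return json.dumps(findSVOs(doc))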
def avg_clauses_per_sent(parser, child_sents):
    """
    Returns the average number of clauses per sentence in a text

    :param parser: English parser object from spacy
    :param child_sents: list of tokenized sentences
    :return: the average number of clauses per sentence in the list
    :rtype: float
    """
    total = 0
    for sent in child_sents:
        parse = parser(' '.join(sent))
        # use the number of SVO triples as a proxy for clause count
        total += len(findSVOs(parse))
    return total / len(child_sents)
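# A hedged usage sketch for avg_clauses_per_sent: `child_sents` is expected to
# be a list of token lists. The model name and sentences are illustrative only;
# the printed value depends on how many SVO triples findSVOs extracts.
import spacy

parser = spacy.load('en_core_web_sm')
child_sents = [
    ['the', 'dog', 'chased', 'the', 'cat'],
    ['she', 'read', 'a', 'book', 'and', 'he', 'cooked', 'dinner'],
]
print(avg_clauses_per_sent(parser, child_sents))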
def subjectObjectExtraction(self, sentence):
    parse = self.parser(sentence)
    return findSVOs(parse)
def extract_subject(self):
    "Extract the subject and objects from the text."
    from subject_object_extraction import findSVOs
    return findSVOs(self.oDoc)
def nlp_parse2(q1, q2=None):
    token1 = []
    lemma1 = []
    pos1 = []
    tag1 = []
    dep1 = []
    # shape1 = []
    alpha1 = []
    stop1 = []
    doc1 = nlp(str(q1))
    for w in doc1:
        token1.append(w.text)
        lemma1.append(w.lemma_)
        pos1.append(w.pos_)
        tag1.append(w.tag_)
        dep1.append(w.dep_)
        # shape1.append(w.shape_)
        alpha1.append(w.is_alpha)
        stop1.append(w.is_stop)
    word_cnt1 = len(token1)
    svo1 = findSVOs(doc1)
    ents1 = [(e.label_, e.text) for e in doc1.ents]
    alpha1_cnt = sum(alpha1)
    stop1_cnt = sum(stop1)
    svo1_cnt = len(svo1)
    # svo_l = [" ".join(svo) for svo in svo1]  # convert the svo
    # svo_str1 = " , ".join(svo_l)

    if q2 is None:
        return (" ".join(token1), " ".join(lemma1), " ".join(pos1),
                " ".join(tag1), " ".join(dep1),
                word_cnt1, svo1, ents1, alpha1_cnt, stop1_cnt, svo1_cnt)

    doc2 = nlp(str(q2))
    doc_similarity = doc1.similarity(doc2)
    token2 = []
    lemma2 = []
    pos2 = []
    tag2 = []
    dep2 = []
    # shape2 = []
    alpha2 = []
    stop2 = []
    for w in doc2:
        token2.append(w.text)
        lemma2.append(w.lemma_)
        pos2.append(w.pos_)
        tag2.append(w.tag_)
        dep2.append(w.dep_)
        # shape2.append(w.shape_)
        alpha2.append(w.is_alpha)
        stop2.append(w.is_stop)
    word_cnt2 = len(token2)
    svo2 = findSVOs(doc2)
    ents2 = [(e.label_, e.text) for e in doc2.ents]
    alpha2_cnt = sum(alpha2)
    stop2_cnt = sum(stop2)
    svo2_cnt = len(svo2)
    # svo_l = [" ".join(svo) for svo in svo2]  # convert the svo
    # svo_str2 = " , ".join(svo_l)

    return (" ".join(token1), " ".join(lemma1), " ".join(pos1),
            " ".join(tag1), " ".join(dep1),
            word_cnt1, svo1, ents1, alpha1_cnt, stop1_cnt, svo1_cnt,
            " ".join(token2), " ".join(lemma2), " ".join(pos2),
            " ".join(tag2), " ".join(dep2),
            word_cnt2, svo2, ents2, alpha2_cnt, stop2_cnt, svo2_cnt,
            doc_similarity)
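# A minimal usage sketch for nlp_parse2, assuming spaCy and the SVO helper are
# loaded at module level (the names and model below are assumptions, not from
# the source). A model with word vectors makes doc.similarity() meaningful:
import spacy
from subject_object_extraction import findSVOs

nlp = spacy.load('en_core_web_md')

features = nlp_parse2("How do I learn Python?",
                      "What is the best way to learn Python?")
doc_similarity = features[-1]  # the pair similarity is the last element
print(doc_similarity)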
# Let's see if it can figure out this analogy
# Man is to King as Woman is to ??
king = parser.vocab['king']
man = parser.vocab['man']
woman = parser.vocab['woman']
result = king.vector - man.vector + woman.vector

# gather all known words, take only the lowercased versions
allWords = list({
    w for w in parser.vocab
    if w.has_vector and w.orth_.islower()
    and w.lower_ != "king" and w.lower_ != "man" and w.lower_ != "woman"
})

# sort by similarity to the result
allWords.sort(key=lambda w: cosine(w.vector, result))
allWords.reverse()

print("\n----------------------------\n"
      "Top 10 closest results for king - man + woman:")
for word in allWords[:10]:
    print(word.orth_)

print("\n----------------------------\n"
      "Top 3 closest results for king - man + woman:")
for word in allWords[:3]:
    print(word.orth_)

# can still work even without punctuation
parse = parser("he and his brother drunk wine")
print(findSVOs(parse))

token = nlp(u"he and his brother shot me and my sister")
print(findSubs(token[0]))
#----------------------------------------
# Let's see if it can figure out this analogy
# Man is to King as Woman is to ??
# (`repvec` / `has_repvec` are the word-vector attributes from the old
# spaCy 0.x API; current releases call them `vector` / `has_vector`)
king = parser.vocab['king']
man = parser.vocab['man']
woman = parser.vocab['woman']
result = king.repvec - man.repvec + woman.repvec

# gather all known words, take only the lowercased versions
allWords = list({
    w for w in parser.vocab
    if w.has_repvec and w.orth_.islower()
    and w.lower_ != "king" and w.lower_ != "man" and w.lower_ != "woman"
})

# sort by similarity to the result
allWords.sort(key=lambda w: cosine(w.repvec, result))
allWords.reverse()

print("\n----------------------------\nTop 3 closest results for king - man + woman:")
for word in allWords[:3]:
    print(word.orth_)
# it got it! Queen!

#------------------------------
from subject_object_extraction import findSVOs

# can still work even without punctuation
parse = parser("he and his brother shot me and my sister")
print(findSVOs(parse))

#------------------------------------
# very complex sample. Only some are correct. Some are missed.
parse = parser("Far out in the uncharted backwaters of the unfashionable end "
               "of the Western Spiral arm of the Galaxy lies a small "
               "unregarded yellow sun.")
print(findSVOs(parse))

#--------------------------------
# https://nicschrading.com/project/Intro-to-NLP-with-spaCy/
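# Both analogy snippets above call a `cosine` helper that must behave as a
# *similarity* (larger = closer), since the word list is sorted ascending and
# then reversed. A minimal sketch of such a helper with numpy; this is an
# assumption, and the originals may use scipy or their own version:
import numpy as np

def cosine(v1, v2):
    """Cosine similarity between two vectors (0.0 if either is all zeros)."""
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0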
def load_data(row):
    # string-distance features over the raw question pair
    lev_dist = Levenshtein.distance(str(row[0]).lower(), str(row[1]).lower())
    jar_dist = jaro_distance(str(row[0]).lower(), str(row[1]).lower())
    dam_dist = damerau_levenshtein_distance(
        str(row[0]).lower(), str(row[1]).lower())

    q1 = parser(str(row[0]))
    q2 = parser(str(row[1]))

    # named-entity labels, then entity surface strings
    set_ent1 = {ele.label_.lower() for ele in q1.ents}
    set_ent2 = {ele.label_.lower() for ele in q2.ents}
    num_ent, val_ent, rate_ent = feat(set_ent1, set_ent2)

    set_ent1 = {' '.join(t.orth_ for t in ele) for ele in q1.ents}
    set_ent2 = {' '.join(t.orth_ for t in ele) for ele in q2.ents}
    num_ent2, val_ent2, rate_ent2 = feat(set_ent1, set_ent2)

    # count matching tokens from the start of both questions
    list_last1 = [ele.lower_ for ele in q1 if ele.pos_ != 'PUNCT']
    list_last2 = [ele.lower_ for ele in q2 if ele.pos_ != 'PUNCT']
    num_for = 0
    val_for = 0.0
    for i in range(min(len(list_last1), len(list_last2))):
        if list_last1[i] == list_last2[i] or match_rating_comparison(
                list_last1[i], list_last2[i]):
            num_for += 1
            val_for += weights.get(list_last1[i], 0)
        else:
            break

    # count matching tokens from the end of both questions
    list_last1.reverse()
    list_last2.reverse()
    num_clean2_rev = 0
    val_clean2_rev = 0.0
    for i in range(min(len(list_last1), len(list_last2))):
        if list_last1[i] == list_last2[i] or match_rating_comparison(
                list_last1[i], list_last2[i]):
            num_clean2_rev += 1
            val_clean2_rev += weights.get(list_last1[i], 0)
        else:
            break

    # overlap features per dependency label
    set_sub1 = {ele.lower_ for ele in q1 if ele.dep_ == 'nsubj'}
    set_sub2 = {ele.lower_ for ele in q2 if ele.dep_ == 'nsubj'}
    num_sub, val_sub, rate_sub = feat(set_sub1, set_sub2)

    set_root1 = {ele.lower_ for ele in q1 if ele.dep_ == 'ROOT'}
    set_root2 = {ele.lower_ for ele in q2 if ele.dep_ == 'ROOT'}
    num_root, val_root, rate_root = feat(set_root1, set_root2)

    set_advmod1 = {ele.lower_ for ele in q1 if ele.dep_ == 'advmod'}
    set_advmod2 = {ele.lower_ for ele in q2 if ele.dep_ == 'advmod'}
    num_advmod, val_advmod, rate_advmod = feat(set_advmod1, set_advmod2)

    set_advcl1 = {ele.lower_ for ele in q1 if ele.dep_ == 'advcl'}
    set_advcl2 = {ele.lower_ for ele in q2 if ele.dep_ == 'advcl'}
    num_advcl, val_advcl, rate_advcl = feat(set_advcl1, set_advcl2)

    set_aux1 = {ele.lower_ for ele in q1 if ele.dep_ == 'aux'}
    set_aux2 = {ele.lower_ for ele in q2 if ele.dep_ == 'aux'}
    num_aux, val_aux, rate_aux = feat(set_aux1, set_aux2)

    set_dobj1 = {ele.lower_ for ele in q1 if ele.dep_ == 'dobj'}
    set_dobj2 = {ele.lower_ for ele in q2 if ele.dep_ == 'dobj'}
    num_dobj, val_dobj, rate_dobj = feat(set_dobj1, set_dobj2)

    # set_poss1 = {ele.lower_ for ele in q1 if ele.dep_ == 'poss'}
    # set_poss2 = {ele.lower_ for ele in q2 if ele.dep_ == 'poss'}
    # num_poss, val_poss, rate_poss = feat(set_poss1, set_poss2)

    # overlap features per part-of-speech tag
    set_noun1 = {ele.lower_ for ele in q1 if ele.pos_ == 'NOUN'}
    set_noun2 = {ele.lower_ for ele in q2 if ele.pos_ == 'NOUN'}
    num_noun, val_noun, rate_noun = feat(set_noun1, set_noun2)

    set_verb1 = {ele.lower_ for ele in q1 if ele.pos_ == 'VERB'}
    set_verb2 = {ele.lower_ for ele in q2 if ele.pos_ == 'VERB'}
    num_verb, val_verb, rate_verb = feat(set_verb1, set_verb2)

    set_adv1 = {ele.lower_ for ele in q1 if ele.pos_ == 'ADV'}
    set_adv2 = {ele.lower_ for ele in q2 if ele.pos_ == 'ADV'}
    num_adv, val_adv, rate_adv = feat(set_adv1, set_adv2)

    # set_adj1 = {ele.lower_ for ele in q1 if ele.pos_ == 'ADJ'}
    # set_adj2 = {ele.lower_ for ele in q2 if ele.pos_ == 'ADJ'}
    # num_adj, val_adj, rate_adj = feat(set_adj1, set_adj2)

    # lowercased, lemmatized SVO triples, plus subject/verb/object projections
    set_svo1 = {(ele[0].lower(), ele[1].lower(), ele[2].lower())
                for ele in findSVOs(q1)}
    set_svo2 = {(ele[0].lower(), ele[1].lower(), ele[2].lower())
                for ele in findSVOs(q2)}
    set_svo1 = {(wnl.lemmatize(ele[0]), wnl.lemmatize(ele[1]),
                 wnl.lemmatize(ele[2])) for ele in set_svo1}
    set_svo2 = {(wnl.lemmatize(ele[0]), wnl.lemmatize(ele[1]),
                 wnl.lemmatize(ele[2])) for ele in set_svo2}
    num_svo, val_svo, rate_svo = feat(set_svo1, set_svo2)

    set_s1 = {ele[0] for ele in set_svo1}
    set_v1 = {ele[1] for ele in set_svo1}
    set_o1 = {ele[2] for ele in set_svo1}
    set_s2 = {ele[0] for ele in set_svo2}
    set_v2 = {ele[1] for ele in set_svo2}
    set_o2 = {ele[2] for ele in set_svo2}
    num_s, val_s, rate_s = feat(set_s1, set_s2)
    num_v, val_v, rate_v = feat(set_v1, set_v2)
    num_o, val_o, rate_o = feat(set_o1, set_o2)

    list_ret = [
        num_ent, num_ent2, num_clean2_rev, num_for,
        lev_dist, jar_dist, dam_dist,
        num_sub, num_root, num_advmod, num_advcl, num_aux,
        # num_poss,
        num_noun, num_verb, num_adv,
        # num_adj,
        num_svo, num_s, num_v, num_o
    ]
    list_ret += [
        val_ent, val_ent2, val_clean2_rev, val_for,
        val_sub, val_root, val_advmod, val_advcl, val_aux, val_dobj,
        # val_poss,
        val_noun, val_verb, val_adv,
        # val_adj,
        val_svo, val_s, val_v, val_o
    ]
    list_ret += [
        rate_ent, rate_ent2,
        rate_sub, rate_root, rate_advmod, rate_advcl, rate_aux, rate_dobj,
        # rate_poss,
        rate_noun, rate_verb, rate_adv,
        # rate_adj,
        rate_svo, rate_s, rate_v, rate_o
    ]
    return list_ret
def extract_entities(self):
    print('in extract entities')
    sentence_parse = Sentence.SPACY_PARSER(self.m_sentence_text)
    spacy_subj = None
    temp_pobj = None
    for token in sentence_parse:
        token_dep = token.dep_
        print(token.orth_, token.dep_, token.head.orth_,
              [t.orth_ for t in token.lefts],
              [t.orth_ for t in token.rights])
        if token_dep == 'pobj':
            temp_pobj = token
        elif token_dep == 'nsubj' or token_dep == 'nsubjpass':
            spacy_subj = token.orth_.lower()
        elif token_dep == 'poss':
            self.assign_poss_entities(token)
        elif token_dep == 'compound' or token_dep == 'amod':
            print('in compound and amod case')
            modifier = Sentence.LEMMATIZER_MODULE.lemmatize(token.orth_)
            compound_dobj = Sentence.LEMMATIZER_MODULE.lemmatize(
                token.head.orth_)
            compound_modifier = CompoundModifier(modifier, compound_dobj)
            print('found compound modifier:', modifier, compound_dobj)
            self.m_compound_modifiers.append(compound_modifier)
            self.m_complex_nouns.append(modifier + " " + compound_dobj)
            # self.temp_dobj = compound_dobj

    sentence_svos = findSVOs(sentence_parse)
    print("svos", sentence_svos, len(sentence_svos))
    if len(sentence_svos) > 0:
        transfer_entity_relation = None
        # print('starts with an expl:', self.m_is_first_word_an_expletive)
        if self.m_is_first_word_an_expletive == False:
            print('svo')
            print(sentence_svos[0][0])
            print(sentence_svos[0][2])
            # trying to assign subj and obj from svo
            self.assign_nsubj(sentence_svos[0][0])
            self.assign_dobj(sentence_svos[0][2])
            print('after trying to assign subj', self.m_nsubj)
            print('after trying to assign dobj:')
            print('dobj exists?:', self.m_has_a_dobj)
            print('dobj:', self.m_dobj)
            print('temp dobj:', self.temp_dobj)
            # print(temp_pobj)
            if self.m_has_a_dobj == False:
                if self.temp_dobj != None:
                    print('before temp dobj')
                    self.assign_dobj(self.temp_dobj)
                    if self.temp_transfer_entity != None:
                        self.assign_transfer_entity(
                            self.temp_transfer_entity, 'dobj')
                elif temp_pobj != None:
                    print('before temp pobj')
                    self.assign_dobj(temp_pobj.orth_.lower())
                    # self.assign_dobj(self.m_pobj, 'pobj')
                    self.assign_transfer_entity(sentence_svos[0][2], 'dobj')
            elif temp_pobj != None:
                print('in temp dobj != None')
                self.assign_transfer_entity(temp_pobj.orth_.lower(), 'pobj')
            elif self.temp_transfer_entity != None:
                print('in temp transfer entity != None')
                self.assign_transfer_entity(self.temp_transfer_entity, 'poss')
        else:
            # print('before 2nd svo')
            self.assign_dobj(sentence_svos[0][2])
            if temp_pobj != None:
                self.assign_nsubj(temp_pobj.orth_.lower())
        # print('before calling extract quantified')
        self.extract_quantified_entities(True, transfer_entity_relation)
    elif spacy_subj != None and temp_pobj != None:
        self.temp_dobj = temp_pobj.orth_
        print('In spacy')
        # print(self.temp_dobj)
        self.assign_nsubj(spacy_subj)
        self.assign_dobj(self.temp_dobj)
        self.extract_quantified_entities(False, None)
    elif spacy_subj != None and self.m_question.m_question_label != 'c':
        # print('spacy_subj is not none')
        self.assign_dobj(spacy_subj)
        self.extract_quantified_entities(False, None)
    elif self.m_question.m_question_label == 'c':
        if self.m_has_a_cardinal:
            print('found nothing should do something.')
            quantified_non_entity = QuantifiedNonEntity(self.m_cardinal)
            if spacy_subj != None:
                self.assign_nsubj(spacy_subj)
            quantified_non_entity.set_owner_entity(self.m_owner_entity)
            self.m_question.add_quantified_non_entity(quantified_non_entity)