Example #1
def load_data(row):

    q1 = parser(str(row[0]))
    q2 = parser(str(row[1]))

    set_ent1 = set([ele.lemma_ for ele in q1.ents])
    set_ent2 = set([ele.lemma_ for ele in q2.ents])
    num_full_match, num_part_match, rate_full_match, rate_part_match = feat2(
        set_ent1, set_ent2)

    set_svo1 = set([(ele[0].lower(), ele[1].lower(), ele[2].lower())
                    for ele in findSVOs(q1)])
    set_svo2 = set([(ele[0].lower(), ele[1].lower(), ele[2].lower())
                    for ele in findSVOs(q2)])

    set_svo1 = set([(wnl.lemmatize(ele[0]), wnl.lemmatize(ele[1]),
                     wnl.lemmatize(ele[2])) for ele in set_svo1])
    set_svo2 = set([(wnl.lemmatize(ele[0]), wnl.lemmatize(ele[1]),
                     wnl.lemmatize(ele[2])) for ele in set_svo2])

    set_vo1 = set((ele[1], ele[2]) for ele in set_svo1)
    set_vo2 = set((ele[1], ele[2]) for ele in set_svo2)
    num_vo, val_vo, rate_vo = feat(set_vo1, set_vo2)

    return [
        num_full_match, num_part_match, rate_full_match, rate_part_match,
        num_vo, val_vo, rate_vo
    ]
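The helpers `feat` and `feat2` are not shown in this snippet. As a rough illustration only, `feat` plausibly returns an overlap count, a weighted overlap value, and an overlap rate for two sets; the sketch below assumes exactly that (the `weights` dict and the rate definition are guesses, not taken from the original project):

def feat(set1, set2):
    # Hypothetical sketch of the missing `feat` helper (semantics assumed).
    weights = {}  # token -> weight mapping; contents assumed
    inter = set1 & set2
    union = set1 | set2
    num = len(inter)                                   # shared elements
    val = sum(weights.get(ele, 0) for ele in inter)    # weighted overlap
    rate = num / len(union) if union else 0.0          # Jaccard-style overlap rate
    return num, val, rate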
Example #2
def nlp_parse(q1, q2=None):
    token1 = []
    lemma1 = []
    pos1 = []
    tag1 = []
    dep1 = []
    #     shape = []
    alpha1 = []
    stop1 = []
    doc1 = nlp(str(q1))  # str() replaces the Python 2 unicode() call
    for w in doc1:
        token1.append(w.text)
        lemma1.append(w.lemma_)
        pos1.append(w.pos_)
        tag1.append(w.tag_)
        dep1.append(w.dep_)
        #         shape1.append(w.shape_)
        alpha1.append(w.is_alpha)
        stop1.append(w.is_stop)
    word_cnt1 = len(token1)
    svo1 = findSVOs(doc1)
    ents1 = [(e.label_, e.text) for e in doc1.ents]
    alpha1_cnt = sum(alpha1)
    stop1_cnt = sum(stop1)
    svo1_len = len(svo1)
    if q2 is None:
        return token1, lemma1, pos1, tag1, dep1, stop1, word_cnt1, svo1, ents1, alpha1_cnt, stop1_cnt, svo1_len

    doc2 = nlp(str(q2))
    doc_similarity = doc1.similarity(doc2)

    token2 = []
    lemma2 = []
    pos2 = []
    tag2 = []
    dep2 = []
    #     shape2 = []
    alpha2 = []
    stop2 = []
    for w in doc2:
        token2.append(w.text)
        lemma2.append(w.lemma_)
        pos2.append(w.pos_)
        tag2.append(w.tag_)
        dep2.append(w.dep_)
        #         shape2.append(w.shape_)
        alpha2.append(w.is_alpha)
        stop2.append(w.is_stop)
    word_cnt2 = len(token2)
    svo2 = findSVOs(doc2)
    ents2 = [(e.label_, e.text) for e in doc2.ents]

    alpha2_cnt = sum(alpha2)
    stop2_cnt = sum(stop2)
    svo2_len = len(svo2)

    return token1, lemma1, pos1, tag1, dep1, stop1, word_cnt1, svo1, ents1, alpha1_cnt, stop1_cnt, svo1_len, \
        token2, lemma2, pos2, tag2, dep2, stop2, word_cnt2, svo2, ents2, alpha2_cnt, stop2_cnt, svo2_len, \
        doc_similarity
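A minimal usage sketch for `nlp_parse`, assuming `nlp` is a loaded spaCy model and `findSVOs` comes from the `subject_object_extraction` module used elsewhere on this page (the model name is an assumption):

import spacy
from subject_object_extraction import findSVOs

nlp = spacy.load('en_core_web_sm')  # model name assumed

# Single-question mode: features for q1 only.
feats_q1 = nlp_parse("What is the capital of France?")

# Pair mode: features for both questions plus a document similarity score.
feats_pair = nlp_parse("What is the capital of France?",
                       "Which city is the capital of France?")
doc_similarity = feats_pair[-1]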
Example #3
def old_get_svos():
    doc = nlp(u'' + request.args.get('sentence'))
    svos = findSVOs(doc)
    printDeps(doc)
    print(svos)
    return json.dumps(svos)
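`old_get_svos` reads `request.args` and returns JSON, which suggests a web view function. A minimal sketch of how it might be wired up, assuming Flask (the route path and model name are assumptions):

import json

import spacy
from flask import Flask, request
from subject_object_extraction import findSVOs

app = Flask(__name__)
nlp = spacy.load('en_core_web_sm')  # model name assumed

@app.route('/svos')  # route path assumed
def get_svos():
    # Parse the ?sentence=... query parameter and return its SVO triples.
    doc = nlp(request.args.get('sentence', ''))
    return json.dumps(findSVOs(doc))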
Example #4
def avg_clauses_per_sent(parser, child_sents):
    """
    Returns the average number of clauses per sentence in a text

    :param parser: English parser object from spacy
    :param child_sents: list of sentences
    :return: returns the average number of clauses per sentence from the list
    :rtype: float
    """
    total = 0  # renamed from `sum` to avoid shadowing the builtin
    for sent in child_sents:
        parse = parser(' '.join(sent))
        total += len(findSVOs(parse))
    return total / len(child_sents)
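A short usage sketch, assuming `parser` is a loaded spaCy model and that each sentence is supplied as a list of tokens, as the `' '.join(...)` call implies (model name assumed):

import spacy
from subject_object_extraction import findSVOs

parser = spacy.load('en_core_web_sm')  # model name assumed
child_sents = [["the", "dog", "chased", "the", "cat"],
               ["she", "sang", "and", "he", "danced"]]
# Average number of SVO triples (clauses) found per sentence.
print(avg_clauses_per_sent(parser, child_sents))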
Example #5
def subjectObjectExtraction(self, sentence):
    parse = self.parser(sentence)
    return findSVOs(parse)
Example #6
def extract_subject(self):
    """Extract the subject and objects from the text."""
    from subject_object_extraction import findSVOs
    return findSVOs(self.oDoc)
Example #7
def nlp_parse2(q1, q2=None):
    token1 = []
    lemma1 = []
    pos1 = []
    tag1 = []
    dep1 = []
    #     shape = []
    alpha1 = []
    stop1 = []
    doc1 = nlp(str(q1))  # str() replaces the Python 2 unicode() call
    for w in doc1:
        token1.append(w.text)
        lemma1.append(w.lemma_)
        pos1.append(w.pos_)
        tag1.append(w.tag_)
        dep1.append(w.dep_)
        #         shape1.append(w.shape_)
        alpha1.append(w.is_alpha)
        stop1.append(w.is_stop)
    word_cnt1 = len(token1)
    svo1 = findSVOs(doc1)
    ents1 = [(e.label_, e.text) for e in doc1.ents]
    alpha1_cnt = sum(alpha1)
    stop1_cnt = sum(stop1)
    svo1_cnt = len(svo1)
    #svo_l = [" ".join(svo) for svo in svo1] # convert the svo
    #svo_str1 = " , ".join(svo_l)
    if q2 is None:
        return " ".join(token1), " ".join(lemma1), " ".join(pos1), " ".join(tag1), " ".join(dep1), \
               word_cnt1, svo1, ents1, alpha1_cnt, stop1_cnt, svo1_cnt

    doc2 = nlp(str(q2))
    doc_similarity = doc1.similarity(doc2)

    token2 = []
    lemma2 = []
    pos2 = []
    tag2 = []
    dep2 = []
    #     shape2 = []
    alpha2 = []
    stop2 = []
    for w in doc2:
        token2.append(w.text)
        lemma2.append(w.lemma_)
        pos2.append(w.pos_)
        tag2.append(w.tag_)
        dep2.append(w.dep_)
        #         shape2.append(w.shape_)
        alpha2.append(w.is_alpha)
        stop2.append(w.is_stop)
    word_cnt2 = len(token2)
    svo2 = findSVOs(doc2)
    ents2 = [(e.label_, e.text) for e in doc2.ents]

    alpha2_cnt = sum(alpha2)
    stop2_cnt = sum(stop2)
    svo2_cnt = len(svo2)
    #svo_l = [" ".join(svo) for svo in svo2] # convert the svo
    #svo_str2 = " , ".join(svo_l)
    return  " ".join(token1), " ".join(lemma1), " ".join(pos1), " ".join(tag1), " ".join(dep1), \
            word_cnt1, svo1, ents1, alpha1_cnt, stop1_cnt, svo1_cnt, \
            " ".join(token2), " ".join(lemma2), " ".join(pos2), " ".join(tag2), " ".join(dep2), \
            word_cnt2, svo2, ents2, alpha2_cnt, stop2_cnt, svo2_cnt, \
            doc_similarity
Example #8
def extract_subject(self):
    """Extract the subject and objects from the text."""
    from subject_object_extraction import findSVOs
    return findSVOs(self.oDoc)
Example #9
# (assumes allWords was already built earlier in the tutorial)
for word in allWords[:10]:
    print(word.orth_)

# Let's see if it can figure out this analogy
# Man is to King as Woman is to ??
king = parser.vocab['king']
man = parser.vocab['man']
woman = parser.vocab['woman']
result = king.vector - man.vector + woman.vector

# gather all known words, take only the lowercased versions
allWords = list({
    w
    for w in parser.vocab if w.has_vector and w.orth_.islower()
    and w.lower_ != "king" and w.lower_ != "man" and w.lower_ != "woman"
})
# sort by similarity to the result
allWords.sort(key=lambda w: cosine(w.vector, result))
allWords.reverse()
print(
    "\n----------------------------\nTop 3 closest results for king - man + woman:"
)
for word in allWords[:3]:
    print(word.orth_)
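The `cosine` function used as the sort key is not defined in this snippet; a minimal NumPy sketch matching how it is called above:

import numpy as np

def cosine(v1, v2):
    # Cosine similarity between two vectors; 0.0 for zero-length vectors.
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0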

# can still work even without punctuation
parse = parser("he and his brother drunk wine")
print(findSVOs(parse))

doc = nlp(u"he and his brother shot me and my sister")
print(findSubs(doc[0]))
#----------------------------------------
# Let's see if it can figure out this analogy
# Man is to King as Woman is to ??
king = parser.vocab['king']
man = parser.vocab['man']
woman = parser.vocab['woman']

result = king.vector - man.vector + woman.vector  # pre-1.0 spaCy used .repvec; modern versions use .vector

# gather all known words, take only the lowercased versions
allWords = list({w for w in parser.vocab if w.has_vector and w.orth_.islower() and w.lower_ != "king" and w.lower_ != "man" and w.lower_ != "woman"})
# sort by similarity to the result
allWords.sort(key=lambda w: cosine(w.vector, result))
allWords.reverse()
print("\n----------------------------\nTop 3 closest results for king - man + woman:")
for word in allWords[:3]:   
    print(word.orth_)
    
# it got it! Queen!
#------------------------------
from subject_object_extraction import findSVOs

# can still work even without punctuation
parse = parser("he and his brother shot me and my sister")
print(findSVOs(parse))
#------------------------------------
# a very complex sample; only some SVOs are found correctly, and some are missed
parse = parser("Far out in the uncharted backwaters of the unfashionable end of the Western Spiral arm of the Galaxy lies a small unregarded yellow sun.")
print(findSVOs(parse))
#--------------------------------
#https://nicschrading.com/project/Intro-to-NLP-with-spaCy/
Example #11
def load_data(row):

    lev_dist = Levenshtein.distance(str(row[0]).lower(), str(row[1]).lower())
    jar_dist = jaro_distance(str(row[0]).lower(), str(row[1]).lower())
    dam_dist = damerau_levenshtein_distance(
        str(row[0]).lower(),
        str(row[1]).lower())

    q1 = parser(str(row[0]))
    q2 = parser(str(row[1]))

    set_ent1 = set([ele.label_.lower() for ele in q1.ents])
    set_ent2 = set([ele.label_.lower() for ele in q2.ents])

    num_ent, val_ent, rate_ent = feat(set_ent1, set_ent2)

    set_ent1 = set([' '.join(t.orth_ for t in ele) for ele in q1.ents])
    set_ent2 = set([' '.join(t.orth_ for t in ele) for ele in q2.ents])

    num_ent2, val_ent2, rate_ent2 = feat(set_ent1, set_ent2)

    list_last1 = [ele.lower_ for ele in q1 if ele.pos_ != 'PUNCT']
    list_last2 = [ele.lower_ for ele in q2 if ele.pos_ != 'PUNCT']
    num_for = 0
    val_for = 0.
    for i in range(min(len(list_last1), len(list_last2))):
        if list_last1[i] == list_last2[i] or match_rating_comparison(
                list_last1[i], list_last2[i]):
            num_for += 1
            val_for += weights.get(list_last1[i], 0)
        else:
            break

    list_last1.reverse()
    list_last2.reverse()
    num_clean2_rev = 0
    val_clean2_rev = 0.
    for i in range(min(len(list_last1), len(list_last2))):
        if list_last1[i] == list_last2[i] or match_rating_comparison(
                list_last1[i], list_last2[i]):
            num_clean2_rev += 1
            val_clean2_rev += weights.get(list_last1[i], 0)
        else:
            break

    set_sub1 = set([ele.lower_ for ele in q1 if ele.dep_ == 'nsubj'])
    set_sub2 = set([ele.lower_ for ele in q2 if ele.dep_ == 'nsubj'])

    num_sub, val_sub, rate_sub = feat(set_sub1, set_sub2)

    set_root1 = set([ele.lower_ for ele in q1 if ele.dep_ == 'ROOT'])
    set_root2 = set([ele.lower_ for ele in q2 if ele.dep_ == 'ROOT'])

    num_root, val_root, rate_root = feat(set_root1, set_root2)

    set_advmod1 = set([ele.lower_ for ele in q1 if ele.dep_ == 'advmod'])
    set_advmod2 = set([ele.lower_ for ele in q2 if ele.dep_ == 'advmod'])

    num_advmod, val_advmod, rate_advmod = feat(set_advmod1, set_advmod2)

    set_advcl1 = set([ele.lower_ for ele in q1 if ele.dep_ == 'advcl'])
    set_advcl2 = set([ele.lower_ for ele in q2 if ele.dep_ == 'advcl'])

    num_advcl, val_advcl, rate_advcl = feat(set_advcl1, set_advcl2)

    set_aux1 = set([ele.lower_ for ele in q1 if ele.dep_ == 'aux'])
    set_aux2 = set([ele.lower_ for ele in q2 if ele.dep_ == 'aux'])

    num_aux, val_aux, rate_aux = feat(set_aux1, set_aux2)

    set_dobj1 = set([ele.lower_ for ele in q1 if ele.dep_ == 'dobj'])
    set_dobj2 = set([ele.lower_ for ele in q2 if ele.dep_ == 'dobj'])

    num_dobj, val_dobj, rate_dobj = feat(set_dobj1, set_dobj2)

    # set_poss1 = set([ele.lower_ for ele in q1 if ele.dep_ == 'poss'])
    # set_poss2 = set([ele.lower_ for ele in q2 if ele.dep_ == 'poss'])

    # num_poss, val_poss, rate_poss = feat(set_poss1, set_poss2)

    set_noun1 = set([ele.lower_ for ele in q1 if ele.pos_ == 'NOUN'])
    set_noun2 = set([ele.lower_ for ele in q2 if ele.pos_ == 'NOUN'])

    num_noun, val_noun, rate_noun = feat(set_noun1, set_noun2)

    set_verb1 = set([ele.lower_ for ele in q1 if ele.pos_ == 'VERB'])
    set_verb2 = set([ele.lower_ for ele in q2 if ele.pos_ == 'VERB'])

    num_verb, val_verb, rate_verb = feat(set_verb1, set_verb2)

    set_adv1 = set([ele.lower_ for ele in q1 if ele.pos_ == 'ADV'])
    set_adv2 = set([ele.lower_ for ele in q2 if ele.pos_ == 'ADV'])

    num_adv, val_adv, rate_adv = feat(set_adv1, set_adv2)

    # set_adj1 = set([ele.lower_ for ele in q1 if ele.pos_ == 'ADJ'])
    # set_adj2 = set([ele.lower_ for ele in q2 if ele.pos_ == 'ADJ'])
    # num_adj, val_adj, rate_adj = feat(set_adj1, set_adj2)

    set_svo1 = set([(ele[0].lower(), ele[1].lower(), ele[2].lower())
                    for ele in findSVOs(q1)])
    set_svo2 = set([(ele[0].lower(), ele[1].lower(), ele[2].lower())
                    for ele in findSVOs(q2)])

    set_svo1 = set([(wnl.lemmatize(ele[0]), wnl.lemmatize(ele[1]),
                     wnl.lemmatize(ele[2])) for ele in set_svo1])
    set_svo2 = set([(wnl.lemmatize(ele[0]), wnl.lemmatize(ele[1]),
                     wnl.lemmatize(ele[2])) for ele in set_svo2])

    num_svo, val_svo, rate_svo = feat(set_svo1, set_svo2)

    set_s1 = set(ele[0] for ele in set_svo1)
    set_v1 = set(ele[1] for ele in set_svo1)
    set_o1 = set(ele[2] for ele in set_svo1)

    set_s2 = set(ele[0] for ele in set_svo2)
    set_v2 = set(ele[1] for ele in set_svo2)
    set_o2 = set(ele[2] for ele in set_svo2)

    num_s, val_s, rate_s = feat(set_s1, set_s2)

    num_v, val_v, rate_v = feat(set_v1, set_v2)

    num_o, val_o, rate_o = feat(set_o1, set_o2)

    list_ret = [
        num_ent,
        num_ent2,
        num_clean2_rev,
        num_for,
        lev_dist,
        jar_dist,
        dam_dist,
        num_sub,
        num_root,
        num_advmod,
        num_advcl,
        num_aux,
        num_dobj,  # num_poss,
        num_noun,
        num_verb,
        num_adv,  # num_adj,
        num_svo,
        num_s,
        num_v,
        num_o
    ]
    list_ret += [
        val_ent,
        val_ent2,
        val_clean2_rev,
        val_for,
        val_sub,
        val_root,
        val_advmod,
        val_advcl,
        val_aux,
        val_dobj,  # val_poss,
        val_noun,
        val_verb,
        val_adv,  # val_adj,
        val_svo,
        val_s,
        val_v,
        val_o
    ]
    list_ret += [
        rate_ent,
        rate_ent2,
        rate_sub,
        rate_root,
        rate_advmod,
        rate_advcl,
        rate_aux,
        rate_dobj,  # rate_poss,
        rate_noun,
        rate_verb,
        rate_adv,  # rate_adj,
        rate_svo,
        rate_s,
        rate_v,
        rate_o
    ]

    return list_ret
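A usage sketch for this `load_data`, with the module-level dependencies it relies on stubbed in. The imports below are plausible sources for the names used (`jellyfish` for `jaro_distance`, `damerau_levenshtein_distance`, and `match_rating_comparison`; the `python-Levenshtein` package for `Levenshtein`; NLTK for `wnl`), but they are assumptions:

import Levenshtein  # python-Levenshtein package; assumed source
import spacy
from jellyfish import (damerau_levenshtein_distance, jaro_distance,
                       match_rating_comparison)  # assumed source of these names
from nltk.stem import WordNetLemmatizer

parser = spacy.load('en_core_web_sm')  # model name assumed
wnl = WordNetLemmatizer()
weights = {}  # token -> weight mapping; contents assumed

row = ("How do I learn Python?", "What is the best way to learn Python?")
features = load_data(row)  # one flat numeric feature vector per question pair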
Example #12
    def extract_entities(self):
        print('in extract entities')
        sentence_parse = Sentence.SPACY_PARSER(self.m_sentence_text)
        spacy_subj = None
        temp_pobj = None
        for token in sentence_parse:
            token_dep = token.dep_
            print(token.orth_, token.dep_, token.head.orth_,
                  [t.orth_
                   for t in token.lefts], [t.orth_ for t in token.rights])
            if token_dep == 'pobj':
                temp_pobj = token
            elif token_dep == 'nsubj' or token_dep == 'nsubjpass':
                spacy_subj = token.orth_.lower()
            elif token_dep == 'poss':
                self.assign_poss_entities(token)
            elif token_dep == 'compound' or token_dep == 'amod':
                print('in compound and amod case')
                modifier = Sentence.LEMMATIZER_MODULE.lemmatize(token.orth_)
                compound_dobj = Sentence.LEMMATIZER_MODULE.lemmatize(
                    token.head.orth_)
                compound_modifier = CompoundModifier(modifier, compound_dobj)
                print('found compound modifier:', modifier, compound_dobj)
                self.m_compound_modifiers.append(compound_modifier)
                self.m_complex_nouns.append(modifier + " " + compound_dobj)
#                 self.temp_dobj = compound_dobj

        sentence_svos = findSVOs(sentence_parse)
        print "svos", sentence_svos, len(sentence_svos)
        if len(sentence_svos) > 0:
            transfer_entity_relation = None
            #             #print 'starts with an expl:',self.m_is_first_word_an_expletive
            if not self.m_is_first_word_an_expletive:

                print('svo')
                print(sentence_svos[0][0])
                print(sentence_svos[0][2])

                #                 trying to assign subj and obj from svo
                self.assign_nsubj(sentence_svos[0][0])
                self.assign_dobj(sentence_svos[0][2])

                print('after trying to assign subj:', self.m_nsubj)
                print('after trying to assign dobj:')
                print('dobj exists?:', self.m_has_a_dobj)
                print('dobj:', self.m_dobj)
                print('temp dobj:', self.temp_dobj)
                #print temp_pobj

                if not self.m_has_a_dobj:
                    if self.temp_dobj is not None:
                        print('before temp dobj')
                        self.assign_dobj(self.temp_dobj)
                        if self.temp_transfer_entity is not None:
                            self.assign_transfer_entity(
                                self.temp_transfer_entity, 'dobj')
                    elif temp_pobj is not None:
                        print('before temp pobj')
                        self.assign_dobj(temp_pobj.orth_.lower())
                        #self.assign_dobj(self.m_pobj, 'pobj')
                        self.assign_transfer_entity(sentence_svos[0][2],
                                                    'dobj')
                elif temp_pobj is not None:
                    print('in temp pobj is not None')
                    self.assign_transfer_entity(temp_pobj.orth_.lower(),
                                                'pobj')
                elif self.temp_transfer_entity is not None:
                    print('in temp transfer entity is not None')
                    self.assign_transfer_entity(self.temp_transfer_entity,
                                                'poss')
            else:
                #                 #print 'before 2nsd svo'
                self.assign_dobj(sentence_svos[0][2])

                if temp_pobj is not None:
                    self.assign_nsubj(temp_pobj.orth_.lower())
            ###print 'before calling extract quantified'
            self.extract_quantified_entities(True, transfer_entity_relation)
        elif spacy_subj is not None and temp_pobj is not None:
            self.temp_dobj = temp_pobj.orth_
            print('In spacy')
            #print self.temp_dobj
            self.assign_nsubj(spacy_subj)
            self.assign_dobj(self.temp_dobj)
            self.extract_quantified_entities(False, None)
        elif spacy_subj is not None and self.m_question.m_question_label != 'c':

            #             print 'spacy_subj is not none'
            self.assign_dobj(spacy_subj)
            self.extract_quantified_entities(False, None)

        elif self.m_question.m_question_label == 'c':
            if self.m_has_a_cardinal:
                print('found nothing should do something.')
                quantified_non_entity = QuantifiedNonEntity(self.m_cardinal)
                if spacy_subj is not None:
                    self.assign_nsubj(spacy_subj)
                    quantified_non_entity.set_owner_entity(self.m_owner_entity)
                    self.m_question.add_quantified_non_entity(
                        quantified_non_entity)