Example #1
def ourLesk(sentence, word, pos1, forceResponse = False):
	
	leskList = []
	if pos1 is not None:
		possibility1 = pylesk.cosine_lesk(sentence, word, pos1)
		possibility2 = pylesk.adapted_lesk(sentence, word)
		
	else:
		possibility1 = pylesk.cosine_lesk(sentence, word)
		possibility2 = pylesk.adapted_lesk(sentence, word)

	
	if possibility1 is not None and possibility2 is not None:
		possibility1 = [str(lemma.name()) for lemma in possibility1.lemmas()]
		possibility2 = [str(lemma.name()) for lemma in possibility2.lemmas()]
		leskList = set(possibility1).intersection(possibility2)
	else:
		if possibility1 is None:
			if possibility2 is not None:
				leskList = [str(lemma.name()) for lemma in possibility2.lemmas()]
			else:
				return None
		else:
			leskList = [str(lemma.name()) for lemma in possibility1.lemmas()]

	
	if len(leskList) > 0:
		print "-------"
		print word
		print leskList
		return list(leskList)
	else:
		return None
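A minimal usage sketch for ourLesk, assuming pylesk is an alias for the pywsd.lesk module (the import is not shown in the example):

# Usage sketch (assumption: pylesk is pywsd.lesk, matching the calls above).
from pywsd import lesk as pylesk

sentence = "I went to the bank to deposit my money"
shared_lemmas = ourLesk(sentence, "bank", "n")
if shared_lemmas:
    print(shared_lemmas)  # lemmas on which cosine_lesk and adapted_lesk agree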
Example #2
def disambiguateWordSenses3(sentence,word,stanfordPOS, senti_db):        #disambiguation with cosine_lesk
    result_list=cosine_lesk(sentence,word,nbest=True)         #result is a list of synsets of word
    result = None
    print word,stanfordPOS
    if result_list:
        for ss,score in result_list:
            pos=ss.pos()
            if (pos == u's'):
                pos = u'a'
            if pos == stanfordPOS:
                result  = ss
                break
    if result:
        pos = result.pos()
        if (pos == u's'):
            pos = u'a'
        offset = result.offset()
        pos_score=0.0
        neg_score=0.0
        if (pos, offset) in senti_db:
            pos_score, neg_score = senti_db[(pos, offset)]
        obj = 1.0-(pos_score+neg_score)
    else:
        obj=1.0
        pos=None
        pos_score=0.0
        neg_score=0.0
    return obj,pos,pos_score,neg_score
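The senti_db argument is not defined in this example; one plausible way to build it, assuming it maps (pos, offset) pairs to (positive, negative) scores as the lookup above implies, is NLTK's SentiWordNet reader. The resulting dictionary can then be passed as the senti_db argument:

# Plausible construction of senti_db (assumption: keys are (pos, offset) tuples and
# values are (pos_score, neg_score) pairs, as the lookup in the function implies).
from nltk.corpus import sentiwordnet as swn  # requires nltk.download('sentiwordnet')

senti_db = {}
for senti_synset in swn.all_senti_synsets():
    syn = senti_synset.synset
    senti_db[(syn.pos(), syn.offset())] = (senti_synset.pos_score(),
                                           senti_synset.neg_score())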
Example #3
 def disambiguateWordSenses3(self,sentence,word,stanfordPOS):        #disambiguation with cosine_lesk
     #result=simple_lesk(sentence,word)
     print word,stanfordPOS
     result_list=cosine_lesk(sentence,word,nbest=True)         #result is a list of synsets of word
     #print result_list
     result = None
     print word,stanfordPOS
     if result_list:
         for ss, score in result_list:
             pos=ss.pos()
             if (pos == u's'):
                 pos = u'a'
             if pos == stanfordPOS:
                 result  = ss
                 print "matched"
                 break
     if result:
         pos = result.pos()
         if (pos == u's'):
             pos = u'a'
         offset = result.offset()
         pos_score=0.0
         neg_score=0.0
         if (pos, offset) in self.db:
      #       print word,pos,offset
             pos_score, neg_score = self.db[(pos, offset)]
         obj = 1.0-(pos_score+neg_score)
         #print "%%%%%%%%%%"
         #print pos_score,neg_score, obj
     else:
         obj=1.0
         pos=None
         pos_score=0.0
         neg_score=0.0
     return obj,pos,pos_score,neg_score
Example #4
def wsd(text, quary):
    
    sentences =  sent_tokenize(text)
    find_sent = ''
    tag = None

    for sent in sentences:
        if quary in sent:
            find_sent = sent
            break
    synonyms = []

    tag2tag = {
        'NN': 'n',
        'NNS': 'n',
        'RB': 'r',
        'VB': 'v',
        'VBP': 'v',
        'VBD': 'v',
        'VBZ': 'v',
        'VBG': 'v',
        'JJ' : 'a'
    }


    if find_sent != "":
        tags = pos_tag(word_tokenize(find_sent))
        tag = [x[1] for x in tags if x[0]==quary][0]
        try:
            tag = tag2tag[tag]
        except KeyError:
            tag = None
        
        answer = cosine_lesk(find_sent, quary, pos=tag, context_is_lemmatized=True, nbest=True)
        for syn in wordnet.synsets(quary):
            for l in syn.lemmas():
                synonyms.append(l.name())
            synonyms = list(set(synonyms))

        print("Synonyms: {}".format(', '.join(synonyms)))
        print("The best definition: {}".format(answer[0][1].definition()))
        print()
        definitions ={}
        for ans in answer:
            definitions[ans[1].definition()] = ceil(ans[0]*100)/100
            #print("Definition: {0}, The similarity is {1}".format(ans[1].definition(), ans[0]))
        return ', '.join(synonyms), answer[0][1].definition(), definitions
    else:
        return ''
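A usage sketch for wsd follows, assuming the imports the function body relies on (NLTK's tokenizers, tagger, and WordNet corpus, pywsd's cosine_lesk, and math.ceil):

# Usage sketch (assumption: these are the imports the function body relies on).
from math import ceil
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import wordnet
from pywsd.lesk import cosine_lesk

text = "I deposited the cheque at the bank. Then I walked home."
synonyms, best_definition, definitions = wsd(text, "bank")
print(best_definition)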
Example #5
def get_synset(metode, word, text):
    synset = ""
    if metode == "original_lesk":
        synset = simple_lesk(text, word)
    elif metode == "simple_lesk":
        synset = adapted_lesk(text, word)
    elif metode == "adapted_lesk":
        synset = cosine_lesk(text, word)
    # elif metode == "path" :
    #     synset = max_similarity(text, word, "path")
    # elif metode == "path" :
    #     synset = max_similarity(text, word, "wup")
    # elif metode == "path" :
    #     synset = max_similarity(text, word, "lin")
    # elif metode == "path" :
    #     synset = max_similarity(text, word, "res")
    # elif metode == "random_sense":
    #     synset = random_sense(word)
    # elif metode == "first_sense":
    #     synset = first_sense(word)
    # elif metode == "most_frequent_sense":
    #     synset = most_frequent_sense(word)
    return synset
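A usage sketch for get_synset, assuming simple_lesk, adapted_lesk, and cosine_lesk are imported from pywsd.lesk as in the call sites above:

# Usage sketch (assumption: the three lesk variants come from pywsd.lesk).
from pywsd.lesk import simple_lesk, adapted_lesk, cosine_lesk

synset = get_synset("original_lesk", "bank", "I sat on the muddy bank of the river")
if synset:
    print(synset.definition())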
Example #6
 def disambiguateWordSenses3(self,sentence,word):        #disambiguation with cosine_lesk
     #result=simple_lesk(sentence,word)
     result=cosine_lesk(sentence,word)         #result is a list of synsets of word
     #print result_list
     if result:
         pos = result.pos()
         if (pos == u's'):
             pos = u'a'
         offset = result.offset()
         pos_score=0.0
         neg_score=0.0
         if (pos, offset) in self.db:
      #       print word,pos,offset
             pos_score, neg_score = self.db[(pos, offset)]
         obj = 1.0-(pos_score+neg_score)
         #print "%%%%%%%%%%"
         #print pos_score,neg_score, obj
     else:
         obj=1.0
         pos=None
         pos_score=0.0
         neg_score=0.0
     return obj,pos,pos_score,neg_score
Example #7
def get_def(word, context, lang):

    #job = json.loads(injob.text)
    #lang = job.lang
    #context = job.context
    #word = job.word

    # remove non-alphanumeric chars
    context = remove_notalpha(context)
    doc = nlp(context)
    meaning = ""  # fallback in case no definition is found below
    if lang != 'eng':
        #call for translation to proper lang
        getstr = "https://glosbe.com/gapi/translate?from=" + lang + "&dest=eng&format=json&phrase=" + word + "&pretty=true"
        response = requests.get(getstr)
        indef = json.loads(response.text)
        word = find_token(indef, doc)
    else:
        for token in doc:
            if word == token.text:
                word = token
                break

    # do two separate lesks
    answer = simple_lesk(context, word.text, pos_convert(word.pos_))
    cosans = cosine_lesk(context, word.text, pos_convert(word.pos_))

    # find what we hope is the better answer
    if (check_def(context, cosans.definition()) > check_def(
            context, answer.definition())):
        answer = cosans

    sense = str(answer)
    sense = sense.split("'")[1].split(".")

    if ((sense[0] != word.lemma_ or int(sense[2]) > 4)
            and word.pos_ != 'PROPN'):
        try:
            answer = wn.synset(word.lemma_ + '.' + pos_convert(word.pos_) +
                               '.01')
        except Exception:
            pass

    if lang != 'eng':
        if lang == 'spa':
            lang = 'es'
        if lang == 'arb':
            lang = 'ar'
        #this should use the spa or arb word given
        if len(indef['tuc']) > 0:
            meaning = ""
            for tuc in indef['tuc']:
                try:
                    if tuc['phrase']['text'] == word.lemma_:
                        esptemp = ""
                        for m in tuc['meanings']:
                            if m['language'] == lang and len(
                                    m['text']) > len(meaning):
                                meaning = m['text']
                except KeyError:
                    pass
    else:
        # needs to look for beginning of sentence
        if (word.pos_ == 'PROPN'):
            meaning = word.text + " is a proper noun."
        elif answer:
            meaning = answer.definition()
    return meaning
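Helpers such as nlp, remove_notalpha, find_token, check_def, and pos_convert are assumed to be defined elsewhere in this project. A plausible pos_convert, assuming it maps spaCy's coarse POS tags to the single-letter WordNet codes pywsd expects, might look like:

# Hypothetical pos_convert helper (assumption: it turns spaCy coarse-grained POS
# tags into the single-letter WordNet POS codes that pywsd's lesk functions accept).
def pos_convert(spacy_pos):
    mapping = {'NOUN': 'n', 'PROPN': 'n', 'VERB': 'v', 'ADJ': 'a', 'ADV': 'r'}
    return mapping.get(spacy_pos)  # None leaves the lesk call unrestricted by POS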
Example #8
def get_def(injob):
    lang = injob['language']
    context = injob['context'].lower()
    word = injob['word'].lower()
    # map full language names to ISO codes
    if lang == 'English':
        lang = 'eng'
    if lang == 'Spanish':
        lang = 'spa'
    if lang == 'Arabic':
        lang = 'arb'
    if lang == 'French':
        lang = 'fra'

    # remove non alphanumeric chars

    doc = nlp(context)

    if lang != 'eng':
        if lang == 'fra':
            stoken = flp(word)
        if lang == 'spa':
            stoken = slp(word)
        for token in stoken:
            print(token.lemma_)
            word = token.lemma_.lower()
        # call for translation to proper lang
        getstr = "https://glosbe.com/gapi/translate?from="+ lang + "&dest=eng&format=json&phrase=" + word + "&pretty=true"
        response = requests.get(getstr)
        indef = json.loads(response.text)
        word = find_token(indef, doc, lang)
        if isinstance(word, str):
            return word
    else:
        for token in doc:
            if word == token.text:
                word = token
                break
    if word and (word.is_stop or word.text == 'I'):
        if lang != 'eng':
            return find_def(indef, lang, word)
        else:
            if word.text == 'I':
                response = "Singular first person pronoun."
            else:
                try:
                    a = o.get_info_about_word(word.lemma_).json()
                except Exception:
                    a = o.get_info_about_word(word.text).json()
                response = a['results'][0]['lexicalEntries'][0][
                    'entries'][0]['senses'][0]['definitions'][0]
            return response

    if word:
        # do two separate lesks
        answer = simple_lesk(context, word.text,
                             pos_convert(word.pos_))
        cosans = cosine_lesk(context, word.text,
                             pos_convert(word.pos_))

        # find what we hope is the better answer
        if(check_def(context, cosans.definition()) >
           check_def(context, answer.definition())):
            answer = cosans

        sense = str(answer)
        sense = sense.split("'")[1].split(".")

        if ((sense[0] != word.lemma_ or
             int(sense[2]) > 4) and word.pos_ != 'PROPN'):
            try:
                answer = wn.synset(word.lemma_ + '.' +
                                   pos_convert(word.pos_) +
                                   '.01')
            except Exception:
                pass

        meaning = None  # fallback in case no branch below assigns a definition
        # probably broken now; the stemmer had problems with capitalization
        if (word.pos_ == 'PROPN'):
            meaning = word.text + " is a proper noun."
        elif lang != 'eng' and len(indef['tuc']) > 0:
            # this should use the spa or arb word given
            meaning = find_def(indef, lang, word)
        elif answer:
            meaning = answer.definition()

        if meaning:
            print("meaning: " + meaning)
            return meaning
        elif lang == 'eng':
            return "Sorry, I don't know that definintion:("
        elif lang == 'spa':
            return "Lo siento, no sé esa definición:("
        elif lang == 'fra':
            return "Désolé, je ne connais pas cette définition:("
    elif lang == 'eng':
        return "Sorry, I don't know that definintion:("
    elif lang == 'spa':
        return "Lo siento, no sé esa definición:("
    elif lang == 'fra':
        return "Désolé, je ne connais pas cette définition:("
Example #9
print "Context:", bank_sents[0]
answer = adapted_lesk(bank_sents[0],'bank','n', True, \
                     nbest=True, keepscore=True)
print "Senses ranked by #overlaps:", answer
best_sense = answer[0][1]
try: definition = best_sense.definition() 
except: definition = best_sense.definition
print "Definition:", definition
print

print "======== TESTING cosine_lesk ===========\n"
from pywsd.lesk import cosine_lesk

print "#TESTING cosine_lesk() ..."
print "Context:", bank_sents[0]
answer = cosine_lesk(bank_sents[0],'bank')
print "Sense:", answer
try: definition = answer.definition() 
except: definition = answer.definition
print "Definition:", definition
print

print "#TESTING cosine_lesk() with nbest results..."
print "Context:", bank_sents[0]
answer = cosine_lesk(bank_sents[0],'bank', nbest=True)
print "Senses ranked by #overlaps:", answer
best_sense = answer[0][0]
try: definition = best_sense.definition() 
except: definition = best_sense.definition
print "Definition:", definition
print
Example #10
print "#TESTING adapted_lesk() with pos, stem, nbest and scores."
print "Context:", bank_sents[0]
answer = adapted_lesk(bank_sents[0],'bank','n', True, \
                     nbest=True, keepscore=True)
print "Senses ranked by #overlaps:", answer
best_sense = answer[0][1]
definition = best_sense.definition()
print "Definition:", definition
print

print "======== TESTING cosine_lesk ===========\n"
from pywsd.lesk import cosine_lesk

print "#TESTING cosine_lesk() ..."
print "Context:", bank_sents[0]
answer = cosine_lesk(bank_sents[0], 'bank')
print "Sense:", answer
definition = answer.definition()
print "Definition:", definition
print

print "#TESTING cosine_lesk() with nbest results..."
print "Context:", bank_sents[0]
answer = cosine_lesk(bank_sents[0], 'bank', nbest=True)
print "Senses ranked by #overlaps:", answer
best_sense = answer[0][1]
definition = best_sense.definition()
print "Definition:", definition
print

print "======== TESTING baseline ===========\n"
Example #11
def get_wordnet_related_words_from_word(word,
                                        context,
                                        synonyms=1,
                                        hypernyms=0,
                                        hyponyms=0):
    """
	Method to generate a list of words that are found to be related to the input word through
	the WordNet ontology and resource. The correct sense of the input word to be used within the
	context of WordNet is picked based on disambiguation from the PyWSD package which takes
	the surrounding text (or whatever text is provided as context) into account. All synonyms,
	hypernyms, and hyponyms are considered to be related words in this case.
	
	Args:
		word (str): The word for which we want to find related words.

		context (str): Text to use for word-sense disambiguation, usually the sentence the word is in.

		synonyms (int, optional): Set to 1 to include synonyms in the set of related words.
		
		hypernyms (int, optional): Set to 1 to include hypernyms in the set of related words.
		
		hyponyms (int, optional): Set to 1 to include hyponyms in the set of related words.
	
	Returns:
		list: The list of related words that were found; it may be empty if nothing was found.
	"""

    # To get the list of synsets for this word if not using disambiguation.
    list_of_possible_s = wordnet.synsets(word)

    # Disambiguation of synsets (https://github.com/alvations/pywsd).
    # Requires installation of non-conda package PyWSD from pip ("pip install pywsd").
    # The methods of disambiguation that are supported by this package are:
    # (simple_lesk, original_lesk, adapted_lesk, cosine_lesk, and others).
    s = cosine_lesk(context, word)

    try:
        # Generate related words using wordnet, including synonyms, hypernyms, and hyponyms.
        # The lists of hypernyms and hyponyms need to be flattened because they're lists of lists from synsets.
        # definition() yields a string.
        # lemma_names() yields a list of strings.
        # hypernyms() yields a list of synsets.
        # hyponyms() yields a list of synsets.
        synset_definition = s.definition()
        synonym_lemmas = s.lemma_names()
        hypernym_lemmas_nested_list = [x.lemma_names() for x in s.hypernyms()]
        hyponym_lemmas_nested_list = [x.lemma_names() for x in s.hyponyms()]
        # Flatten those lists of lists.
        hypernym_lemmas = list(
            itertools.chain.from_iterable(hypernym_lemmas_nested_list))
        hyponym_lemmas = list(
            itertools.chain.from_iterable(hyponym_lemmas_nested_list))

        # Print out information about the synset that was picked during disambiguation.
        #print(synset_definition)
        #print(synonym_lemmas)
        #print(hypernym_lemmas)
        #print(hyponym_lemmas)
        related_words = []
        if synonyms == 1:
            related_words.extend(synonym_lemmas)
        if hypernyms == 1:
            related_words.extend(hypernym_lemmas)
        if hyponyms == 1:
            related_words.extend(hyponym_lemmas)
        return (related_words)

    except AttributeError:
        return ([])
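A usage sketch, assuming wordnet is nltk.corpus.wordnet, cosine_lesk comes from pywsd.lesk, and itertools is imported, matching the names used inside the function:

# Usage sketch (assumption: these imports match the names used in the function above).
import itertools
from nltk.corpus import wordnet
from pywsd.lesk import cosine_lesk

related = get_wordnet_related_words_from_word(
    "bank", "I sat on the bank of the river", synonyms=1, hypernyms=1, hyponyms=0)
print(related)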
Example #12
            for word in con:
                if word == token.text:
                    context = token.text + " "
            answer = simple_lesk(context, token.text, pos_convert(token.pos_))
            print(answer)
            if not answer:
                continue
        except Exception:
            continue

        sense = split_syn(answer)
        print(sense[0] + " " + token.lemma_)
        if ((sense[0] != token.lemma_ or int(sense[2]) > 4)
                and token.pos_ != 'PROPN'):
            try:
                cosans = cosine_lesk(context, token.text,
                                     pos_convert(token.pos_))
                if (check_def(context, cosans.definition()) > check_def(
                        context, answer.definition())):
                    answer = cosans
                if ((sense[0] != token.lemma_ or int(sense[2]) > 4)
                        and token.pos_ != 'PROPN'):
                    answer = wn.synset(token.lemma_ + '.' +
                                       pos_convert(token.pos_) + '.01')
                    print("unlikely sense detected - new sense:")
                    print(answer)
            except Exception:
                pass
        # needs to look for beginning of sentence
        if (token.pos_ == 'PROPN'):
            print(token.text + " is a proper noun.")
        elif answer:
Example #13
def wsd_lesk(raw_df, algorithm_choice):
    """This finds the synset of the word using
        the original sentence as context and
        different lesk algorithms from nltk-
        and pywsd-packages.

        Algorithm choices are: 1. nltk's lesk,
        2. pywsd simple_lesk, 3. pywsd adapted_lesk, 4. pywsd cosine_lesk."""
    start = timer()
    algorithm_dict = {1: "nltk_lesk", 2: "pywsd_simple_lesk",
                      3: "pywsd_advanced_lesk", 4: "pywsd_cosine_lesk"}
    df = raw_df
    full_aspect_synset_list = []
    full_aspect_synset_list_definition = []
    aspect_synset_list_definition = []
    aspect_synset_list = []
    opinion_synset_list = []
    opinion_synset_list_definition = []
    full_opinion_synset_list = []
    full_opinion_synset_list_definition = []
    aspect_opinion = ["aspect_tags", "opinion_tags"]
    tokenized_sentences = raw_df["tokenized_sentence"]
    non_tokenized_sentences = raw_df["original_text"]

    for opinion_list in aspect_opinion:
        for i, phrase in enumerate(df[opinion_list]):
            multiple_word_found = False
            for j, word in enumerate(phrase):
                special_word = False
                if multiple_word_found is False:
                    # Check here for special words such as "bug".
                    aspect = check_for_special_word(word)
                    if aspect is not None:
                        special_word = True
                    wn_check = []
                    if len(phrase) >= 2:
                        k = 0
                        temporary_combined_word = []
                        while k < len(phrase):
                            temporary_combined_word.append(phrase[k][0])
                            k += 1
                        combined_word_string = '_'.join(temporary_combined_word)
                        wn_check = wn.synsets(combined_word_string, pos=find_wordnet_pos(word[1]))
                        multiple_word_found = True
                    if len(wn_check) == 0:
                        wn_check = wn.synsets(word[0], pos=find_wordnet_pos(word[1]))
                        multiple_word_found = False
                    if len(wn_check) > 0:
                        if special_word is False:
                            if algorithm_choice == 1:
                                if multiple_word_found is True:
                                    aspect = lesk(tokenized_sentences[i], combined_word_string, find_wordnet_pos(word[1]))
                                else:
                                    aspect = lesk(tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 2:
                                if multiple_word_found is True:
                                    aspect = pylesk.simple_lesk(non_tokenized_sentences[i], combined_word_string, find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.simple_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 3:
                                if multiple_word_found is True:
                                    aspect = pylesk.adapted_lesk(non_tokenized_sentences[i], combined_word_string,
                                                             find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.adapted_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 4:
                                if multiple_word_found is True:
                                    aspect = pylesk.cosine_lesk(non_tokenized_sentences[i], combined_word_string,
                                                            find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.cosine_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                        if aspect is not None:
                            if opinion_list is "aspect_tags":
                                aspect_synset_list.append(aspect)
                                aspect_synset_list_definition.append(aspect.definition())
                            else:
                                opinion_synset_list.append(aspect)
                                opinion_synset_list_definition.append(aspect.definition())
            if opinion_list is "aspect_tags":
                full_aspect_synset_list.append(aspect_synset_list)
                full_aspect_synset_list_definition.append(aspect_synset_list_definition)
                aspect_synset_list = []
                aspect_synset_list_definition = []
            else:
                full_opinion_synset_list.append(opinion_synset_list)
                full_opinion_synset_list_definition.append(opinion_synset_list_definition)
                opinion_synset_list = []
                opinion_synset_list_definition = []
    df[algorithm_dict[algorithm_choice] + "_aspect_synset"] = pd.Series(full_aspect_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_aspect_definition"] = pd.Series(full_aspect_synset_list_definition).values
    df[algorithm_dict[algorithm_choice] + "_opinion_synset"] = pd.Series(full_opinion_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_opinion_definition"] = pd.Series(full_opinion_synset_list_definition).values
    end = timer()
    logging.debug("WSD Lesk Time: %.2f seconds" % (end - start))
    return df
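A usage sketch for wsd_lesk, assuming raw_df carries the four columns the function reads and that helpers such as find_wordnet_pos, check_for_special_word, lesk, and pylesk are available in the surrounding module:

# Usage sketch (assumption: the column names below match what wsd_lesk reads, and the
# helper functions it calls are defined elsewhere in the module).
import pandas as pd

raw_df = pd.DataFrame({
    "original_text": ["The battery life is great"],
    "tokenized_sentence": [["The", "battery", "life", "is", "great"]],
    "aspect_tags": [[("battery", "NN"), ("life", "NN")]],
    "opinion_tags": [[("great", "JJ")]],
})
result_df = wsd_lesk(raw_df, algorithm_choice=2)  # 2 = pywsd simple_lesk
print(result_df["pywsd_simple_lesk_aspect_definition"].iloc[0])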
Example #14
for eachword in words:
    if has_synset(eachword):
        answer = adapted_lesk(raw_sentence, eachword)
        adaptedlesk_answer.append(answer)
        print "Sense :", answer
        print eachword+":"+answer.definition()+"\n"
    else:
        print eachword+": "+eachword+"\n"
        adaptedlesk_answer.append(eachword)
        
        
print "\nDisambiguating your sentence word by word using Cosine Lesk algorithm. Hold on. \n======================================================"

for eachword in words:
    if has_synset(eachword):
        answer = cosine_lesk(raw_sentence, eachword)
        cosinelesk_answer.append(answer)
        print "Sense :", answer
        print eachword+":"+answer.definition()+"\n"
    else:
        print eachword+": "+eachword+"\n"
        cosinelesk_answer.append(eachword)
        
print "Word Definition Comparison\n====================================\n"
    
for i in range(len(simplelesk_answer)):  # assuming the lists are of the same length
    print "\n============================================================"
    print "\nWord being compared is: "+words[i]
    if simplelesk_answer[i]==adaptedlesk_answer[i]==cosinelesk_answer[i]:
        print "\nSame definition in all algorithms."