Code Example #1
File: leskanswers.py Project: tgrant59/pydante
def get_lesk_answers(senseval_data):
    time_start = time.clock()

    # Getting answers from lesk algorithms
    original_lesk_answers = {}
    simple_lesk_answers = {}
    adapted_lesk_answers = {}
    for sentence_data in senseval_data:
        for phrase in sentence_data["test_phrases"]:
            word_id, word = phrase["headword"]
            original_lesk_answers[word_id] = lesk.original_lesk(" ".join(sentence_data["sentence"]), word)
            simple_lesk_answers[word_id] = lesk.simple_lesk(" ".join(sentence_data["sentence"]), word)
            adapted_lesk_answers[word_id] = lesk.adapted_lesk(" ".join(sentence_data["sentence"]), word)
        for word_id, word in sentence_data["test_words"].iteritems():
            original_lesk_answers[word_id] = lesk.original_lesk(" ".join(sentence_data["sentence"]), word)
            simple_lesk_answers[word_id] = lesk.simple_lesk(" ".join(sentence_data["sentence"]), word)
            adapted_lesk_answers[word_id] = lesk.adapted_lesk(" ".join(sentence_data["sentence"]), word)
        sys.stdout.write(".")
    lesk_answers_list = []
    lesk_answers_list.append((original_lesk_answers, "original lesk"))
    lesk_answers_list.append((simple_lesk_answers, "simple lesk"))
    lesk_answers_list.append((adapted_lesk_answers, "adapted lesk"))

    time_end = time.clock()
    print "\nlesk took " + str(time_end - time_start) + " seconds"
    return lesk_answers_list
Code Example #2
File: wordifier.py Project: singhketan/Synonymizer
def ourLesk(sentence, word, pos1, forceResponse = False):
	
	leskList = []
	if pos1 is not None:
		possibility1 = pylesk.cosine_lesk(sentence, word, pos1)
		possibility2 = pylesk.adapted_lesk(sentence, word)
		
	else:
		possibility1 = pylesk.cosine_lesk(sentence, word)
		possibility2 = pylesk.adapted_lesk(sentence, word)

	
	if possibility1 is not None and possibility2 is not None:
		possibility1 = [str(lemma.name()) for lemma in possibility1.lemmas()]
		possibility2 = [str(lemma.name()) for lemma in possibility2.lemmas()]
		leskList = set(possibility1).intersection(possibility2)
	else:
		if possibility1 is None:
			if possibility2 is not None:
				leskList = [str(lemma.name()) for lemma in possibility2.lemmas()]
			else:
				return None
		else:
			leskList = [str(lemma.name()) for lemma in possibility1.lemmas()]

	
	if len(leskList) > 0:
		print "-------"
		print word
		print leskList
		return list(leskList)
	else:
		return None
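The idea in ourLesk above (keep only the lemma names that the cosine and adapted Lesk variants agree on) can be sketched in a self-contained, Python 3 form as follows; the example sentence is made up and pywsd is assumed to be installed:

from pywsd.lesk import adapted_lesk, cosine_lesk

sentence = "I went to the bank to deposit my money"
word = "bank"

sense_a = cosine_lesk(sentence, word, "n")
sense_b = adapted_lesk(sentence, word)

if sense_a is not None and sense_b is not None:
    lemmas_a = {lemma.name() for lemma in sense_a.lemmas()}
    lemmas_b = {lemma.name() for lemma in sense_b.lemmas()}
    # Lemma names that both Lesk variants agree on for this context
    print(sorted(lemmas_a & lemmas_b))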
Code Example #3
File: leskanswers.py Project: tgrant59/pydante
def get_lesk_answers(senseval_data):
    time_start = time.clock()

    # Getting answers from lesk algorithms
    original_lesk_answers = {}
    simple_lesk_answers = {}
    adapted_lesk_answers = {}
    for sentence_data in senseval_data:
        for phrase in sentence_data["test_phrases"]:
            word_id, word = phrase["headword"]
            original_lesk_answers[word_id] = lesk.original_lesk(
                " ".join(sentence_data["sentence"]), word)
            simple_lesk_answers[word_id] = lesk.simple_lesk(
                " ".join(sentence_data["sentence"]), word)
            adapted_lesk_answers[word_id] = lesk.adapted_lesk(
                " ".join(sentence_data["sentence"]), word)
        for word_id, word in sentence_data["test_words"].iteritems():
            original_lesk_answers[word_id] = lesk.original_lesk(
                " ".join(sentence_data["sentence"]), word)
            simple_lesk_answers[word_id] = lesk.simple_lesk(
                " ".join(sentence_data["sentence"]), word)
            adapted_lesk_answers[word_id] = lesk.adapted_lesk(
                " ".join(sentence_data["sentence"]), word)
        sys.stdout.write(".")
    lesk_answers_list = []
    lesk_answers_list.append((original_lesk_answers, "original lesk"))
    lesk_answers_list.append((simple_lesk_answers, "simple lesk"))
    lesk_answers_list.append((adapted_lesk_answers, "adapted lesk"))

    time_end = time.clock()
    print "\nlesk took " + str(time_end - time_start) + " seconds"
    return lesk_answers_list
Code Example #4
def get_similarity():
    que_topics, que = get_suggested_answer_topics()
    answer_topics, text = get_student_answer_topics()
    length = len(list(set(que_topics) & set(answer_topics)))
    print(str(length) + " topics matched")
    # Calculating the score based on number of topics matched
    topics_score = abs(topic_match(que_topics, answer_topics, length))
    print("")
    print(topics_score)
    synsets_que_topics = []
    synsets_ans_topics = []
    sim_score = 0
    # calculating similarity using wordnet's wup_similarity
    # Getting appropriate sense of topic from the text using "lesk"(word sense disambiguation algorithm)
    for i in que_topics:
        synsets_que_topics.append(adapted_lesk(que, i, pos='n'))
    for i in answer_topics:
        synset_answer = adapted_lesk(text, i, pos='n')
        print(str(synset_answer) + '..')
        if str(synset_answer) != "None":
            synsets_ans_topics.append(synset_answer)
    print("Similarity Score")
    sim_score = compute_similarity(synsets_que_topics,
                                   synsets_ans_topics) * 100
    print(sim_score)
    print("")
    print("Average Score: " + str(abs(topics_score + int(sim_score) / 2)))
Code Example #5
def _get_disambiguated_synset(self, token: TokenEN,
                              text: TextHolderEN) -> Optional[Synset]:
    return adapted_lesk(
        context_sentence=text.raw_text,
        ambiguous_word=token.lemma_extended.replace(" ", "_"),
        pos=token.pos_simple,
    )
Code Example #6
def get_syns(story_dict, cast_no1, cast_no2):
    syn_dict = OrderedDict()
    syns1 = []
    story1 = []
    story1 = remove_stop_words((tokenise(story_dict[cast_no1])))
    for word in story1:
        syns1.append(adapted_lesk(story_dict[cast_no1], word))
    syn_dict[cast_no1] = syns1
    syns2 = []
    story2 = []
    story2 = remove_stop_words((tokenise(story_dict[cast_no2])))
    for word in story2:
        syns2.append(adapted_lesk(story_dict[cast_no2], word))
    syn_dict[cast_no2] = syns2

    return syn_dict
Code Example #7
def getDef(sent, targetWord):
    # Get the definition of the word

    #defineSent = cosine_lesk(sent,targetWord).definition()
    defineSent = adapted_lesk(sent, targetWord).definition()

    return defineSent
Code Example #8
def get_syns(story_dict, cast_no1, cast_no2):
    syn_dict = OrderedDict()
    syns1 = []
    story1 = []
    story1 = remove_stop_words((tokenise(story_dict[cast_no1])))
    for word in story1:
        syns1.append(adapted_lesk(story_dict[cast_no1], word))
    syn_dict[cast_no1] = syns1
    syns2 = []
    story2 = []
    story2 = remove_stop_words((tokenise(story_dict[cast_no2])))
    for word in story2:
        syns2.append(adapted_lesk(story_dict[cast_no2], word))
    syn_dict[cast_no2] = syns2

    return syn_dict
Code Example #9
def readGenreFilesAndTagWordsForSenses(core_nlp_files):
    for genre_file_path, genre_file_name in core_nlp_files:
        dictionary = dict()
        with open(genre_file_path) as f:
            print 'Processing File', genre_file_path
            synset_wsd_file = genre_file_path.replace(CORE_NLP_FILE_SUFFIX, SYNSET_WSD_FILE_SUFFIX)
            if os.path.exists(synset_wsd_file):
                continue
            lines = f.readlines()[:100]
            output = []
            for line in lines:
                line = 'dictionary=' + line
                exec(line)
                sentences = dictionary[SENTENCES]
                for sent in sentences:
                    parsetree = sent[PARSE_TREE]
                    t = ParentedTree.fromstring(parsetree)
                    sentence_result = []
                    txt = sent[TXT]
                    for word, pos in t.pos():
                        if re.match(POS_PATTERN_FOR_WSD, pos) and pos not in ['DT', 'CC', 'CD']:
                            ranked_synsets = lsk.adapted_lesk(unicode(txt), unicode(word))
                            ranked_synset_prob_names = None
                            if ranked_synsets:
                                ranked_synset_prob_names = [(prob, ranked_synset.name())\
                                                            for prob, ranked_synset in ranked_synsets]
                            result = (word, ranked_synset_prob_names)
                            sentence_result.append(result)
                    output.append(sentence_result)

            with open(synset_wsd_file, 'w') as f1:
                f1.write(str(output))
Code Example #10
File: wsd.py Project: Shoop123/Reezy-NLP-Django
def get_disambiguated_definition(sentence, word, pos):
    translated_pos = get_wordnet_pos(pos)
    try:
        synset = adapted_lesk(sentence, word, pos=translated_pos)
    except:
        synset = None

    if synset is None:
        return word
    else:
        return synset.definition()
Code Example #11
def get_syns(story_dict):
    syn_dict = OrderedDict()
    i = 0
    while i < len(story_dict):
        key = 'cast' + ` i `
        syns = []
        story = []
        story = remove_stop_words((tokenise(story_dict[key])))
        for word in story:
            syns.append(adapted_lesk(story_dict[key], word))
        syn_dict[key] = syns
        i += 1
    return syn_dict
Code Example #12
	def get_wordsense(self,sent,word):
		word= word.lower()
		if len(word.split())>0:
			word = word.replace(" ","_")
		
		
		synsets = wn.synsets(word,'n')
		if synsets:
			wup = max_similarity(sent, word, 'wup', pos='n')
			adapted_lesk_output =  adapted_lesk(sent, word, pos='n')
			lowest_index = min (synsets.index(wup),synsets.index(adapted_lesk_output))
			return synsets[lowest_index]
		else:
			return None
Code Example #13
def seperateByDef(targetWord):
    # Returns a dictionary grouped by definition
    sentList = scrape.scrape(targetWord)

    dictDef = {}
    for i, sent in enumerate(sentList):

        #defineSent = cosine_lesk(sent,targetWord).definition()
        defineSent = adapted_lesk(sent, targetWord).definition()

        if defineSent not in dictDef:
            dictDef[defineSent] = [sent]
        else:
            dictDef[defineSent].append(sent)
    return dictDef
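A smaller, self-contained version of the grouping idea above (without the scrape dependency), using a couple of made-up sentences:

from pywsd.lesk import adapted_lesk

sentences = [
    "I deposited the cheque at the bank",
    "We had a picnic on the river bank",
]
by_definition = {}
for sent in sentences:
    sense = adapted_lesk(sent, "bank")
    if sense is None:
        continue
    # Group each sentence under the definition of the sense chosen for its context
    by_definition.setdefault(sense.definition(), []).append(sent)
print(by_definition)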
Code Example #14
def get_syns(story_dict):
    syn_dict = OrderedDict()
    i = 0
    while i<len(story_dict):
        key = 'cast' + `i`
        syns = []
        story = []
        story = remove_stop_words((tokenise(story_dict[key])))
        for word in story:
            syns.append(adapted_lesk(story_dict[key], word))
        syn_dict[key] = syns


        i+=1
    return syn_dict
Code Example #15
def bayes_theorem(context, vocab, word_count, sum_word, word_median):
    words_probs = {}
    print len(vocab)
    count = 0
    for word in vocab:
        if count % 1000 == 0:
            print 'word ' + str(count)
        count += 1
        sent = context

        ambiguous = vocab.get(word).split("_")[0]
        post = vocab.get(word).split("_")[1]
        #print ambiguous, post
        try:
            answer = adapted_lesk(sent,
                                  ambiguous,
                                  pos=penn_to_wn(post),
                                  nbest=True)
        except Exception, e:
            continue
        total = 0
        for j in range(len(answer)):
            total += answer[j][0]

        if total == 0:
            continue

        for j in range(len(answer)):
            if answer[j][0] == 0:
                continue
            prob_w = 0.0
            prob_s_w = float(answer[j][0]) / total

            if word_count.has_key(vocab.get(word)):
                prob_w = word_count.get(vocab.get(word)) / float(sum_word)
            else:
                prob_w = word_median

            prob_w_s = prob_s_w * prob_w

            if words_probs.has_key(word):
                aux = words_probs.get(word)
                aux[int(answer[j][1].offset)] = prob_w_s
                words_probs[word] = aux
            else:
                aux = {}
                aux[int(answer[j][1].offset)] = prob_w_s
                words_probs[word] = aux
Code Example #16
def word_sense(sentence, keyword):
    print("5.Getting word sense to obtain best MCQ options with WordNet...")
    word = keyword.lower()
    if len(word.split())>0:
        word = word.replace(" ","_")  
    syon_sets = wordnet.synsets(word,'n')
    if syon_sets:
        try:
            wup = max_similarity(sentence, word, 'wup', pos='n')
            adapted_lesk_output =  adapted_lesk(sentence, word, pos='n')
            lowest_index = min(syon_sets.index(wup),syon_sets.index(adapted_lesk_output))
            return syon_sets[lowest_index]
        except:
            return syon_sets[0]           
    else:
        return None
Code Example #17
File: wsd.py Project: aferrugento/SemLDA
def bayes_theorem(context, vocab, word_count, sum_word, word_median):
	words_probs = {}
	print len(vocab)
	count = 0
	for word in vocab:
		if count%1000 == 0:
			print 'word ' + str(count)
		count += 1
		sent = context

		ambiguous = vocab.get(word).split("_")[0]
		post = vocab.get(word).split("_")[1]
		#print ambiguous, post
		try:
			answer = adapted_lesk(sent, ambiguous, pos= penn_to_wn(post), nbest=True)
		except Exception, e:
			continue
		total = 0
		for j in range(len(answer)):
			total += answer[j][0]		
		
		if total == 0:
			continue

		for j in range(len(answer)):
			if answer[j][0] == 0:
				continue
			prob_w = 0.0
			prob_s_w = float(answer[j][0])/total
			
			if word_count.has_key(vocab.get(word)):
				prob_w = word_count.get(vocab.get(word))/float(sum_word)
			else:
				prob_w = word_median

			prob_w_s = prob_s_w * prob_w

			if words_probs.has_key(word):
				aux = words_probs.get(word)
				aux[int(answer[j][1].offset)] = prob_w_s
				words_probs[word] = aux
			else:
				aux = {}
				aux[int(answer[j][1].offset)] = prob_w_s
				words_probs[word] = aux
Code Example #18
def get_wordsense(sent, word):
    """
    
        Get the sense (WordNet synset) of a word, in context, using (1) the Lesk algorithm and (2) max similarity.
        Useful for word sense disambiguation tasks (e.g., when one word means different things
        depending on context).
    
        Paper: https://thesai.org/Downloads/Volume11No3/Paper_30-Adapted_Lesk_Algorithm.pdf
        
        The goal here is to see if the word has synonyms (or words close in meaning)
        that we could potentially use as answer choices
        
    """

    word = word.lower()

    if len(word.split()) > 0:
        word = word.replace(" ", "_")

    # get set of synonyms
    synsets = wn.synsets(word, 'n')

    if synsets:

        # get similarity between possible synsets of all words in
        # context sentence and possible synsets of ambiguous words,
        # to determine "context" of the word of interest and what it
        # "should" mean
        wup = max_similarity(sent, word, "wup", pos='n')

        # use Lesk algorithm, which will assume that words in the same
        # "neighborhood", or area of text, will tend to share the same topic.

        adapted_lesk_output = adapted_lesk(sent, word, pos="n")
        lowest_index = min(synsets.index(wup),
                           synsets.index(adapted_lesk_output))
        return synsets[lowest_index]
    else:
        print(f"No synonyms found for the word {word}")
        return None
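A minimal usage sketch for get_wordsense() above; it assumes the same imports the function relies on (wn, max_similarity, adapted_lesk) are already in scope, and the sentence and keyword are made up:

sentence = "The batsman hit the ball to the boundary in the cricket match"
sense = get_wordsense(sentence, "cricket")
if sense is not None:
    # e.g. the sport sense of "cricket" rather than the insect
    print(sense.name(), "-", sense.definition())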
Code Example #19
def get_synset(metode, word, text):
    synset = ""
    if metode == "original_lesk":
        synset = simple_lesk(text, word)
    elif metode == "simple_lesk":
        synset = adapted_lesk(text, word)
    elif metode == "adapted_lesk":
        synset = cosine_lesk(text, word)
    # elif metode == "path" :
    #     synset = max_similarity(text, word, "path")
    # elif metode == "path" :
    #     synset = max_similarity(text, word, "wup")
    # elif metode == "path" :
    #     synset = max_similarity(text, word, "lin")
    # elif metode == "path" :
    #     synset = max_similarity(text, word, "res")
    # elif metode == "random_sense":
    #     synset = random_sense(word)
    # elif metode == "first_sense":
    #     synset = first_sense(word)
    # elif metode == "most_frequent_sense":
    #     synset = most_frequent_sense(word)
    return synset
Code Example #20
def main(argv):
    cast_no = 'cast' + ` int(argv[0]) `
    filepath = os.path.join(os.path.expanduser('~'), 'workspace',
                            'Dissertation', 'resources', 'casts.json')
    with open(filepath) as f:
        all_casts = yaml.safe_load(f.read().encode('utf-8'))

    stories = get_stories(all_casts)
    our_story = stories[cast_no]
    syns = []
    story = []
    story = remove_stop_words(tokenise(our_story))
    for word in story:
        syns.append(adapted_lesk(our_story, word))

    print syns
    max_depth = 0
    for syn in syns:
        if syn is not None and syn.min_depth() > max_depth:
            max_depth = syn.min_depth()
            print max_depth
            print syn
    print max_depth
Code Example #21
File: test_wsd.py Project: SericWong/pywsd
print "Context:", plant_sents[0]
answer = simple_lesk(plant_sents[0],'plant','n', True, \
                     nbest=True, keepscore=True, normalizescore=True)
print "Senses ranked by #overlaps:", answer
best_sense = answer[0][1]
try: definition = best_sense.definition() 
except: definition = best_sense.definition
print "Definition:", definition
print

print "======== TESTING adapted_lesk ===========\n"
from pywsd.lesk import adapted_lesk

print "#TESTING adapted_lesk() ..."
print "Context:", bank_sents[0]
answer = adapted_lesk(bank_sents[0],'bank')
print "Sense:", answer
try: definition = answer.definition()
except: definition = answer.definition
print "Definition:", definition
print

print "#TESTING adapted_lesk() with pos, stem, nbest and scores."
print "Context:", bank_sents[0]
answer = adapted_lesk(bank_sents[0],'bank','n', True, \
                     nbest=True, keepscore=True)
print "Senses ranked by #overlaps:", answer
best_sense = answer[0][1]
try: definition = best_sense.definition() 
except: definition = best_sense.definition
print "Definition:", definition
Code Example #22
File: wsd.py Project: aferrugento/SemLDA
def main(file_name):
	start = time.time()
	#string = '/home/adriana/Dropbox/mine/Tese/preprocessing/data_output/'
	#string = '/home/aferrugento/Desktop/'
	string = ''
	h = open(string + file_name + '_proc.txt')
	sentences = h.read()
	h.close()
	extra_synsets = {}
	sentences = sentences.split("\n")
	for i in range(len(sentences)):
		sentences[i] = sentences[i].split(" ")
		for j in range(len(sentences[i])):
			if sentences[i][j] == '':
				continue
			sentences[i][j] = sentences[i][j].split("_")[0]

	for i in range(len(sentences)):
		aux = ''
		for j in range(len(sentences[i])):
			aux += sentences[i][j] + ' '
		sentences[i] = aux
	word_count = pickle.load(open('word_count_new.p'))
	synset_count = pickle.load(open('synset_count.p'))
	word_count_corpus = calculate_word_frequency(sentences)

	sum_word_corpus = 0
	for key in word_count_corpus.keys():
		sum_word_corpus += word_count_corpus.get(key)
	sum_word = 0
	for key in word_count.keys():
		sum_word += word_count.get(key)
	sum_synset = 0
	for key in synset_count.keys():
		sum_synset += synset_count.get(key)

	word_list = []
	for key in word_count.keys():
		word_list.append(word_count.get(key))
	synset_list = []
	for key in synset_count.keys():
		synset_list.append(synset_count.get(key))
	word_list.sort()
	synset_list.sort()

	#print len(word_list), len(synset_list)
	#print len(word_list)/2., len(synset_list)/2., (len(word_list)/2.) -1, (len(synset_list)/2.) -1
	#print word_list[len(word_list)/2], word_list[(len(word_list)/2)-1]
	#print synset_list[len(synset_list)/2], synset_list[(len(synset_list)/2)-1]
	word_median = round(2./sum_word, 5)
	synset_median = round(2./sum_synset, 5)
	#print word_median, synset_median
	#print sum_word, sum_synset
	#return

	
	#f = open(string + 'preprocess_semLDA_EPIA/NEWS2_snowballstopword_wordnetlemma_pos_freq.txt')
	f = open(string + file_name +'_freq.txt')
	m = f.read()
	f.close()
	m = m.split("\n")

	for i in range(len(m)):
		m[i] = m[i].split(" ")

	count = 0
	imag = -1
	#f = open(string + 'preprocess_semLDA_EPIA/znew_eta_NEWS2.txt')
	f = open(string + file_name + '_eta.txt')
	g = f.read()
	f.close()

	g = g.split("\n")
	for i in range(len(g)):
		g[i] = g[i].split(" ")


	dic_g = create_dicio(g)

	g = open(string + file_name +'_wsd.txt','w')
	
	#dictio = pickle.load(open(string + 'preprocess_semLDA_EPIA/NEWS2_snowballstopword_wordnetlemma_pos_vocab.p'))
	dictio = pickle.load(open(string + file_name +'_vocab.p'))
	nn = open(string + file_name +'_synsetVoc.txt','w')
	synsets = {}
	to_write = []
	p = open(string + 'NEWS2_wsd.log','w')
	for i in range(len(m)):
		nana = str(m[i][0]) + ' '
		print 'Doc ' + str(i)
		p.write('---------- DOC ' +str(i) + ' ----------\n')
		#words_probs = bayes_theorem(sentences[i], dictio, word_count, sum_word, word_median)
		#return
		#g.write(str(m[i][0]) + ' ')
		for k in range(1, len(m[i])):
			#print sentences[i]
			
			if m[i][k] == '':
				continue
			#print dictio.get(int(m[i][k].split(":")[0])) + str(m[i][k].split(":")[0])
			#print wn.synsets(dictio.get(int(m[i][k].split(":")[0])).split("_")[0], penn_to_wn(dictio.get(int(m[i][k].split(":")[0])).split("_")[1]))
			#if no synsets exist for that word
			if len(wn.synsets(dictio.get(int(m[i][k].split(":")[0])).split("_")[0], penn_to_wn(dictio.get(int(m[i][k].split(":")[0])).split("_")[1]))) == 0:
				nana += m[i][k]+":1[" +str(count)+":"+str(1)+"] "
				synsets[imag] = count
				extra_synsets[imag] = dictio.get(int(m[i][k].split(":")[0]))
				#g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ")
				imag -= 1
				count += 1
				continue
			sent = sentences[i]
			ambiguous = dictio.get(int(m[i][k].split(":")[0])).split("_")[0]
			post = dictio.get(int(m[i][k].split(":")[0])).split("_")[1]
			try:
				answer = adapted_lesk(sent, ambiguous, pos= penn_to_wn(post), nbest=True)
			except Exception, e:
				#in case lesk misbehaves

				s = wn.synsets(dictio.get(int(m[i][k].split(":")[0])).split("_")[0], penn_to_wn(dictio.get(int(m[i][k].split(":")[0])).split("_")[1]))
				if len(s) != 0:
					count2 = 0
					#check how many synsets exist in semcor
					#for n in range(len(s)):
					#	if dic_g.has_key(str(s[n].offset)):
					#		words = dic_g.get(str(s[n].offset))
					#		for j in range(len(words)):
					#			if words[j].split(":")[0] == m[i][k].split(":")[0]:
					#				count2 += 1
					# if none exists, create an imaginary synset
					#if count2 == 0:
					#	nana += m[i][k]+":1[" +str(count)+":"+str(1)+"] "
					#	synsets[imag] = count
					#	extra_synsets[imag] = dictio.get(int(m[i][k].split(":")[0]))
						#g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ")
					#	count += 1
					#	imag -= 1
					#	continue
					#if they exist, fetch their probabilities from semcor
					nana += m[i][k] +':'+ str(len(s)) + '['
					c = 1
					prob = 1.0/len(s)
					for n in range(len(s)):
						#print answer[n][1].offset
						#print 'Coco ' + str(s[n].offset)
						#if dic_g.has_key(str(s[n].offset)):
						#words = dic_g.get(str(s[n].offset))
						#for j in range(len(words)):
						#	if words[j].split(":")[0] == m[i][k].split(":")[0]:
						#		aux = 0
						a = (s[n].offset())
								#print s[n].offset()
						if synsets.has_key(a):
							aux = synsets.get(a)
						else:
							synsets[a] = count
							aux = count
							count += 1
						if n == len(s) - 1:
							nana += str(aux) + ':' + str(prob) + '] '
						else:
							nana += str(aux) + ':' + str(prob) + ' '
				else:
					nana += m[i][k]+":1[" +str(count)+":"+str(1)+"] "
					synsets[imag] = count
					extra_synsets[imag] = dictio.get(int(m[i][k].split(":")[0]))
					#g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ")
					count += 1
					imag -= 1
				continue
			
			
			#g.write(m[i][k] +':'+ str(len(answer)) + '[')
			total = 0

			for j in range(len(answer)):
				total += answer[j][0]
			#if lesk returns no answer, create an imaginary synset
			if len(answer) == 0:
				nana += m[i][k]+":1[" +str(count)+":"+str(1)+"] "
				synsets[imag] = count
				extra_synsets[imag] = dictio.get(int(m[i][k].split(":")[0]))
				#g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ")
				count += 1
				imag -= 1
				continue

			#print ambiguous
			#print total
			#print answer
			#if none of the synsets has any overlap, look up their probabilities in semcor
			if total == 0:
				#print 'ZERO'
				count2 = 0
				#for n in range(len(answer)):
				#	if dic_g.has_key(str(answer[n][1].offset)):
				#		words = dic_g.get(str(answer[n][1].offset))
				#		for j in range(len(words)):
				#			if words[j].split(":")[0] == m[i][k].split(":")[0]:
				#				count2 += 1
				#if count2 == 0:
				#	nana += m[i][k]+":1[" +str(count)+":"+str(1)+"] "
				#	synsets[imag] = count
				#	extra_synsets[imag] = dictio.get(int(m[i][k].split(":")[0]))
					#g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ")
				#	count += 1
				#	imag -= 1
				#	continue
				s = wn.synsets(dictio.get(int(m[i][k].split(":")[0])).split("_")[0], penn_to_wn(dictio.get(int(m[i][k].split(":")[0])).split("_")[1]))
				nana += m[i][k] +':'+ str(len(s)) + '['
				c = 1
				prob = 1.0/len(s)
				for n in range(len(s)):
					#print answer[n][1].offset
					#print 'Coco ' + str(s[n].offset)
					#if dic_g.has_key(str(s[n].offset)):
					#words = dic_g.get(str(s[n].offset))
					#for j in range(len(words)):
					#	if words[j].split(":")[0] == m[i][k].split(":")[0]:
					#		aux = 0
					a = (s[n].offset())
							#print s[n].offset()
					if synsets.has_key(a):
						aux = synsets.get(a)
					else:
						synsets[a] = count
						aux = count
						count += 1
					if n == len(s) - 1:
						nana += str(aux) + ':' + str(prob) + '] '
					else:
						nana += str(aux) + ':' + str(prob) + ' '

				#print nana
				continue
			#count how many synsets are not zero
			count2 = 0
			for j in range(len(answer)):
				if answer[j][0] == 0:
					continue
				else:
					count2 += 1
			c = 1
			nana += m[i][k] +':'+ str(count2) + '['
			for j in range(len(answer)):
				#words_synsets = words_probs.get(int(m[i][k].split(':')[0]))
				#s.write(answer[j][1].offset+"\n")
				if answer[j][0] == 0:
					continue
				aux = 0
				a = (answer[j][1].offset())
				#print 'Coco '+ str(answer[j][1].offset())
				if synsets.has_key(a):
					aux = synsets.get(a)
				else:
					synsets[a] = count
					aux = count
					count += 1
				prob_s = 0.0
				prob_w = 0.0
				prob_s_w = float(answer[j][0])/total
				
				#if synset_count.has_key(str(answer[j][1].offset)):
				#	prob_s = synset_count.get(str(answer[j][1].offset))/float(sum_synset)
				#else:
				#	prob_s = 0.1
				prob_s_s = 1.0/count2

				#if word_count.has_key(dictio.get(int(m[i][k].split(":")[0]))):
				#	prob_w = word_count.get(dictio.get(int(m[i][k].split(":")[0])))/float(sum_word)
				#else:
				#	prob_w = 0.1

				if word_count_corpus.has_key(dictio.get(int(m[i][k].split(":")[0])).split("_")[0]):
					prob_w = word_count_corpus.get(dictio.get(int(m[i][k].split(":")[0])).split("_")[0])/float(sum_word_corpus)
				else:
					prob_w = 0.1
				prob_w_s = (prob_w * prob_s_w) / prob_s_s 

				if j == len(answer) - 1 or count2 == c:
					if prob_w_s > 1.0:
						#print 'Word: 'dictio.get(int(m[i][k].split(":")[0])) + ' Synset: ' + str(answer[j][1])
						p.write('Word: '+ dictio.get(int(m[i][k].split(":")[0])) + ' Synset: ' + str(answer[j][1]))
						#print 'Synsets disambiguated: ' + str(answer)
						p.write('---- Synsets disambiguated: ' + str(answer))
						#print synset_count.get(str(answer[j][1].offset)), word_count.get(dictio.get(int(m[i][k].split(":")[0]))), sum_synset, sum_word
						#print 'P(s)=' +prob_s +', P(w)='+prob_w  +', P(s|w)='+ prob_s_w  +', P(w|s)='+ prob_w_s
						p.write('---- P(s)=' +str(prob_s) +', P(w)='+ str(prob_w)  +', P(s|w)='+ str(prob_s_w)  +', P(w|s)='+ str(prob_w_s))
						p.write("\n")
						nana += str(aux) + ':' + str(1) + '] '
					#nana += str(aux) + ':' + str(words_synsets.get(answer[j][1].offset)) + '] '
					else:
						nana += str(aux) + ':' + str(prob_w_s) + '] '
					#g.write(str(aux) + ':' + str(float(answer[j][0]/total)) + '] ')
				else:
					c += 1 
					if prob_w_s > 1.0:
						#print 'Word: 'dictio.get(int(m[i][k].split(":")[0])) + ' Synset: ' + str(answer[j][1])
						p.write('Word: '+ dictio.get(int(m[i][k].split(":")[0])) + ' Synset: ' + str(answer[j][1]))
						#print 'Synsets disambiguated: ' + str(answer)
						p.write('---- Synsets disambiguated: ' + str(answer))
						#print synset_count.get(str(answer[j][1].offset)), word_count.get(dictio.get(int(m[i][k].split(":")[0]))), sum_synset, sum_word
						#print 'P(s)=' +prob_s +', P(w)='+prob_w  +', P(s|w)='+ prob_s_w  +', P(w|s)='+ prob_w_s
						p.write('---- P(s)=' +str(prob_s) +', P(w)='+ str(prob_w)  +', P(s|w)='+ str(prob_s_w)  +', P(w|s)='+ str(prob_w_s))
						p.write("\n")
						nana += str(aux) + ':' + str(1) + '] '
					#nana += str(aux) + ':' + str(words_synsets.get(answer[j][1].offset)) +' '
					else:
						nana += str(aux) + ':' + str(prob_w_s) +' '
					#g.write(str(aux) + ':' + str(float(answer[j][0]/total)) +' ')
		nana += '\n'
		#print nana
		#return
		to_write.append(nana)
Code Example #23
from pywsd.lesk import adapted_lesk
raw_sentence=raw_input("Please enter your sentence : ")
raw_word=raw_input("Please enter input word :")

print "#TESTING adapted_lesk() with pos, stem, nbest and scores."
print "Context:", raw_sentence
answer = adapted_lesk(raw_sentence,raw_word,'n', True, \
                     nbest=True, keepscore=True)
print "Senses ranked by #overlaps:", answer
best_sense = answer[0][1]
try: definition = best_sense.definition() 
except: definition = best_sense.definition
print "Definition:", definition
Code Example #24
File: disambiguation.py Project: anweshm4/ReWordTool
for eachword in words:
    if has_synset(eachword):
        answer = simple_lesk(raw_sentence, eachword)
        simplelesk_answer.append(answer)
        print "Sense :", answer
        print eachword+":"+answer.definition()+"\n"
    else:
        print eachword+": "+eachword+"\n"    
        simplelesk_answer.append(eachword)
        
        
print "\nDisambiguating your sentence word by word using Adapted Lesk algorithm. Hold on. \n======================================================"

for eachword in words:
    if has_synset(eachword):
        answer = adapted_lesk(raw_sentence, eachword)
        adaptedlesk_answer.append(answer)
        print "Sense :", answer
        print eachword+":"+answer.definition()+"\n"
    else:
        print eachword+": "+eachword+"\n"
        adaptedlesk_answer.append(eachword)
        
        
print "\nDisambiguating your sentence word by word using Cosine Lesk algorithm. Hold on. \n======================================================"

for eachword in words:
    if has_synset(eachword):
        answer = cosine_lesk(raw_sentence, eachword)
        cosinelesk_answer.append(answer)
        print "Sense :", answer
Code Example #25
def main(file_name):
    start = time.time()
    #string = '/home/adriana/Dropbox/mine/Tese/preprocessing/data_output/'
    #string = '/home/aferrugento/Desktop/'
    string = ''
    h = open(string + file_name + '_proc.txt')
    sentences = h.read()
    h.close()
    extra_synsets = {}
    sentences = sentences.split("\n")
    for i in range(len(sentences)):
        sentences[i] = sentences[i].split(" ")
        for j in range(len(sentences[i])):
            if sentences[i][j] == '':
                continue
            sentences[i][j] = sentences[i][j].split("_")[0]

    for i in range(len(sentences)):
        aux = ''
        for j in range(len(sentences[i])):
            aux += sentences[i][j] + ' '
        sentences[i] = aux
    word_count = pickle.load(open('word_count_new.p'))
    synset_count = pickle.load(open('synset_count.p'))
    word_count_corpus = calculate_word_frequency(sentences)

    sum_word_corpus = 0
    for key in word_count_corpus.keys():
        sum_word_corpus += word_count_corpus.get(key)
    sum_word = 0
    for key in word_count.keys():
        sum_word += word_count.get(key)
    sum_synset = 0
    for key in synset_count.keys():
        sum_synset += synset_count.get(key)

    word_list = []
    for key in word_count.keys():
        word_list.append(word_count.get(key))
    synset_list = []
    for key in synset_count.keys():
        synset_list.append(synset_count.get(key))
    word_list.sort()
    synset_list.sort()

    #print len(word_list), len(synset_list)
    #print len(word_list)/2., len(synset_list)/2., (len(word_list)/2.) -1, (len(synset_list)/2.) -1
    #print word_list[len(word_list)/2], word_list[(len(word_list)/2)-1]
    #print synset_list[len(synset_list)/2], synset_list[(len(synset_list)/2)-1]
    word_median = round(2. / sum_word, 5)
    synset_median = round(2. / sum_synset, 5)
    #print word_median, synset_median
    #print sum_word, sum_synset
    #return

    #f = open(string + 'preprocess_semLDA_EPIA/NEWS2_snowballstopword_wordnetlemma_pos_freq.txt')
    f = open(string + file_name + '_freq.txt')
    m = f.read()
    f.close()
    m = m.split("\n")

    for i in range(len(m)):
        m[i] = m[i].split(" ")

    count = 0
    imag = -1
    #f = open(string + 'preprocess_semLDA_EPIA/znew_eta_NEWS2.txt')
    f = open(string + file_name + '_eta.txt')
    g = f.read()
    f.close()

    g = g.split("\n")
    for i in range(len(g)):
        g[i] = g[i].split(" ")

    dic_g = create_dicio(g)

    g = open(string + file_name + '_wsd.txt', 'w')

    #dictio = pickle.load(open(string + 'preprocess_semLDA_EPIA/NEWS2_snowballstopword_wordnetlemma_pos_vocab.p'))
    dictio = pickle.load(open(string + file_name + '_vocab.p'))
    nn = open(string + file_name + '_synsetVoc.txt', 'w')
    synsets = {}
    to_write = []
    p = open(string + 'NEWS2_wsd.log', 'w')
    for i in range(len(m)):
        nana = str(m[i][0]) + ' '
        print 'Doc ' + str(i)
        p.write('---------- DOC ' + str(i) + ' ----------\n')
        #words_probs = bayes_theorem(sentences[i], dictio, word_count, sum_word, word_median)
        #return
        #g.write(str(m[i][0]) + ' ')
        for k in range(1, len(m[i])):
            #print sentences[i]

            if m[i][k] == '':
                continue
            #print dictio.get(int(m[i][k].split(":")[0])) + str(m[i][k].split(":")[0])
            #print wn.synsets(dictio.get(int(m[i][k].split(":")[0])).split("_")[0], penn_to_wn(dictio.get(int(m[i][k].split(":")[0])).split("_")[1]))
            #if no synsets exist for that word
            if len(
                    wn.synsets(
                        dictio.get(int(m[i][k].split(":")[0])).split("_")[0],
                        penn_to_wn(
                            dictio.get(int(
                                m[i][k].split(":")[0])).split("_")[1]))) == 0:
                nana += m[i][k] + ":1[" + str(count) + ":" + str(1) + "] "
                synsets[imag] = count
                extra_synsets[imag] = dictio.get(int(m[i][k].split(":")[0]))
                #g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ")
                imag -= 1
                count += 1
                continue
            sent = sentences[i]
            ambiguous = dictio.get(int(m[i][k].split(":")[0])).split("_")[0]
            post = dictio.get(int(m[i][k].split(":")[0])).split("_")[1]
            try:
                answer = adapted_lesk(sent,
                                      ambiguous,
                                      pos=penn_to_wn(post),
                                      nbest=True)
            except Exception, e:
                #in case lesk misbehaves

                s = wn.synsets(
                    dictio.get(int(m[i][k].split(":")[0])).split("_")[0],
                    penn_to_wn(
                        dictio.get(int(m[i][k].split(":")[0])).split("_")[1]))
                if len(s) != 0:
                    count2 = 0
                    #check how many synsets exist in semcor
                    #for n in range(len(s)):
                    #	if dic_g.has_key(str(s[n].offset)):
                    #		words = dic_g.get(str(s[n].offset))
                    #		for j in range(len(words)):
                    #			if words[j].split(":")[0] == m[i][k].split(":")[0]:
                    #				count2 += 1
                    # if none exists, create an imaginary synset
                    #if count2 == 0:
                    #	nana += m[i][k]+":1[" +str(count)+":"+str(1)+"] "
                    #	synsets[imag] = count
                    #	extra_synsets[imag] = dictio.get(int(m[i][k].split(":")[0]))
                    #g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ")
                    #	count += 1
                    #	imag -= 1
                    #	continue
                    #if they exist, fetch their probabilities from semcor
                    nana += m[i][k] + ':' + str(len(s)) + '['
                    c = 1
                    prob = 1.0 / len(s)
                    for n in range(len(s)):
                        #print answer[n][1].offset
                        #print 'Coco ' + str(s[n].offset)
                        #if dic_g.has_key(str(s[n].offset)):
                        #words = dic_g.get(str(s[n].offset))
                        #for j in range(len(words)):
                        #	if words[j].split(":")[0] == m[i][k].split(":")[0]:
                        #		aux = 0
                        a = (s[n].offset())
                        #print s[n].offset()
                        if synsets.has_key(a):
                            aux = synsets.get(a)
                        else:
                            synsets[a] = count
                            aux = count
                            count += 1
                        if n == len(s) - 1:
                            nana += str(aux) + ':' + str(prob) + '] '
                        else:
                            nana += str(aux) + ':' + str(prob) + ' '
                else:
                    nana += m[i][k] + ":1[" + str(count) + ":" + str(1) + "] "
                    synsets[imag] = count
                    extra_synsets[imag] = dictio.get(int(
                        m[i][k].split(":")[0]))
                    #g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ")
                    count += 1
                    imag -= 1
                continue

            #g.write(m[i][k] +':'+ str(len(answer)) + '[')
            total = 0

            for j in range(len(answer)):
                total += answer[j][0]
            #if lesk returns no answer, create an imaginary synset
            if len(answer) == 0:
                nana += m[i][k] + ":1[" + str(count) + ":" + str(1) + "] "
                synsets[imag] = count
                extra_synsets[imag] = dictio.get(int(m[i][k].split(":")[0]))
                #g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ")
                count += 1
                imag -= 1
                continue

            #print ambiguous
            #print total
            #print answer
            #if none of the synsets has any overlap, look up their probabilities in semcor
            if total == 0:
                #print 'ZERO'
                count2 = 0
                #for n in range(len(answer)):
                #	if dic_g.has_key(str(answer[n][1].offset)):
                #		words = dic_g.get(str(answer[n][1].offset))
                #		for j in range(len(words)):
                #			if words[j].split(":")[0] == m[i][k].split(":")[0]:
                #				count2 += 1
                #if count2 == 0:
                #	nana += m[i][k]+":1[" +str(count)+":"+str(1)+"] "
                #	synsets[imag] = count
                #	extra_synsets[imag] = dictio.get(int(m[i][k].split(":")[0]))
                #g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ")
                #	count += 1
                #	imag -= 1
                #	continue
                s = wn.synsets(
                    dictio.get(int(m[i][k].split(":")[0])).split("_")[0],
                    penn_to_wn(
                        dictio.get(int(m[i][k].split(":")[0])).split("_")[1]))
                nana += m[i][k] + ':' + str(len(s)) + '['
                c = 1
                prob = 1.0 / len(s)
                for n in range(len(s)):
                    #print answer[n][1].offset
                    #print 'Coco ' + str(s[n].offset)
                    #if dic_g.has_key(str(s[n].offset)):
                    #words = dic_g.get(str(s[n].offset))
                    #for j in range(len(words)):
                    #	if words[j].split(":")[0] == m[i][k].split(":")[0]:
                    #		aux = 0
                    a = (s[n].offset())
                    #print s[n].offset()
                    if synsets.has_key(a):
                        aux = synsets.get(a)
                    else:
                        synsets[a] = count
                        aux = count
                        count += 1
                    if n == len(s) - 1:
                        nana += str(aux) + ':' + str(prob) + '] '
                    else:
                        nana += str(aux) + ':' + str(prob) + ' '

                #print nana
                continue
            #count how many synsets are not zero
            count2 = 0
            for j in range(len(answer)):
                if answer[j][0] == 0:
                    continue
                else:
                    count2 += 1
            c = 1
            nana += m[i][k] + ':' + str(count2) + '['
            for j in range(len(answer)):
                #words_synsets = words_probs.get(int(m[i][k].split(':')[0]))
                #s.write(answer[j][1].offset+"\n")
                if answer[j][0] == 0:
                    continue
                aux = 0
                a = (answer[j][1].offset())
                #print 'Coco '+ str(answer[j][1].offset())
                if synsets.has_key(a):
                    aux = synsets.get(a)
                else:
                    synsets[a] = count
                    aux = count
                    count += 1
                prob_s = 0.0
                prob_w = 0.0
                prob_s_w = float(answer[j][0]) / total

                #if synset_count.has_key(str(answer[j][1].offset)):
                #	prob_s = synset_count.get(str(answer[j][1].offset))/float(sum_synset)
                #else:
                #	prob_s = 0.1
                prob_s_s = 1.0 / count2

                #if word_count.has_key(dictio.get(int(m[i][k].split(":")[0]))):
                #	prob_w = word_count.get(dictio.get(int(m[i][k].split(":")[0])))/float(sum_word)
                #else:
                #	prob_w = 0.1

                if word_count_corpus.has_key(
                        dictio.get(int(m[i][k].split(":")[0])).split("_")[0]):
                    prob_w = word_count_corpus.get(
                        dictio.get(int(m[i][k].split(":")[0])).split("_")
                        [0]) / float(sum_word_corpus)
                else:
                    prob_w = 0.1
                prob_w_s = (prob_w * prob_s_w) / prob_s_s

                if j == len(answer) - 1 or count2 == c:
                    if prob_w_s > 1.0:
                        #print 'Word: 'dictio.get(int(m[i][k].split(":")[0])) + ' Synset: ' + str(answer[j][1])
                        p.write('Word: ' +
                                dictio.get(int(m[i][k].split(":")[0])) +
                                ' Synset: ' + str(answer[j][1]))
                        #print 'Synsets disambiguated: ' + str(answer)
                        p.write('---- Synsets disambiguated: ' + str(answer))
                        #print synset_count.get(str(answer[j][1].offset)), word_count.get(dictio.get(int(m[i][k].split(":")[0]))), sum_synset, sum_word
                        #print 'P(s)=' +prob_s +', P(w)='+prob_w  +', P(s|w)='+ prob_s_w  +', P(w|s)='+ prob_w_s
                        p.write('---- P(s)=' + str(prob_s) + ', P(w)=' +
                                str(prob_w) + ', P(s|w)=' + str(prob_s_w) +
                                ', P(w|s)=' + str(prob_w_s))
                        p.write("\n")
                        nana += str(aux) + ':' + str(1) + '] '
                    #nana += str(aux) + ':' + str(words_synsets.get(answer[j][1].offset)) + '] '
                    else:
                        nana += str(aux) + ':' + str(prob_w_s) + '] '
                    #g.write(str(aux) + ':' + str(float(answer[j][0]/total)) + '] ')
                else:
                    c += 1
                    if prob_w_s > 1.0:
                        #print 'Word: 'dictio.get(int(m[i][k].split(":")[0])) + ' Synset: ' + str(answer[j][1])
                        p.write('Word: ' +
                                dictio.get(int(m[i][k].split(":")[0])) +
                                ' Synset: ' + str(answer[j][1]))
                        #print 'Synsets disambiguated: ' + str(answer)
                        p.write('---- Synsets disambiguated: ' + str(answer))
                        #print synset_count.get(str(answer[j][1].offset)), word_count.get(dictio.get(int(m[i][k].split(":")[0]))), sum_synset, sum_word
                        #print 'P(s)=' +prob_s +', P(w)='+prob_w  +', P(s|w)='+ prob_s_w  +', P(w|s)='+ prob_w_s
                        p.write('---- P(s)=' + str(prob_s) + ', P(w)=' +
                                str(prob_w) + ', P(s|w)=' + str(prob_s_w) +
                                ', P(w|s)=' + str(prob_w_s))
                        p.write("\n")
                        nana += str(aux) + ':' + str(1) + '] '
                    #nana += str(aux) + ':' + str(words_synsets.get(answer[j][1].offset)) +' '
                    else:
                        nana += str(aux) + ':' + str(prob_w_s) + ' '
                    #g.write(str(aux) + ':' + str(float(answer[j][0]/total)) +' ')
        nana += '\n'
        #print nana
        #return
        to_write.append(nana)
Code Example #26
from pattern.en import tag
import nltk
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
nltk.data.path.append('/media/santhosh/Data/workspace/nltk_data')


# for word, pos in tag('I feel *happy*!'):
#     print word, pos
# s = parsetree('The cat sat on the mat.', relations=True, lemmata=True)
# print repr(s)

# from pattern.en import parse
# s = 'This is my sample'
# s = parse(s, relations=True, lemmata=True)
# print s

from pywsd import lesk as lsk
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset

data = lsk.adapted_lesk(u'I killed Cricket', u'Cricket')
ranked_synsets = data
probs = 0.0
for ranked_synset in ranked_synsets:
    prob, syn = ranked_synset
    print prob, syn.name()
    probs += prob
print probs

Code Example #27
def wsd_lesk(raw_df, algorithm_choice):
    """This finds the synset of the word using
        the original sentence as context and
        different lesk algorithms from nltk-
        and pywsd-packages.

        Algorithm choices are: 1. nltk's lesk
        2. pywsd simple_lesk, 3. pywsd advanced_lesk, 4. pywsd cosine_lesk."""
    start = timer()
    algorithm_dict = {1: "nltk_lesk", 2: "pywsd_simple_lesk",
                      3: "pywsd_advanced_lesk", 4: "pywsd_cosine_lesk"}
    df = raw_df
    full_aspect_synset_list = []
    full_aspect_synset_list_definition = []
    aspect_synset_list_definition = []
    aspect_synset_list = []
    opinion_synset_list = []
    opinion_synset_list_definition = []
    full_opinion_synset_list = []
    full_opinion_synset_list_definition = []
    aspect_opinion = ["aspect_tags", "opinion_tags"]
    tokenized_sentences = raw_df["tokenized_sentence"]
    non_tokenized_sentences = raw_df["original_text"]

    for opinion_list in aspect_opinion:
        for i, phrase in enumerate(df[opinion_list]):
            multiple_word_found = False
            for j, word in enumerate(phrase):
                special_word = False
                if multiple_word_found is False:
                    # Check here for special words such as "bug".
                    aspect = check_for_special_word(word)
                    if aspect is not None:
                        special_word = True
                    wn_check = []
                    if len(phrase) >= 2:
                        k = 0
                        temporary_combined_word = []
                        while k < len(phrase):
                            temporary_combined_word.append(phrase[k][0])
                            k += 1
                        combined_word_string = '_'.join(temporary_combined_word)
                        wn_check = wn.synsets(combined_word_string, pos=find_wordnet_pos(word[1]))
                        multiple_word_found = True
                    if len(wn_check) == 0:
                        wn_check = wn.synsets(word[0], pos=find_wordnet_pos(word[1]))
                        multiple_word_found = False
                    if len(wn_check) > 0:
                        if special_word is False:
                            if algorithm_choice == 1:
                                if multiple_word_found is True:
                                    aspect = lesk(tokenized_sentences[i], combined_word_string, find_wordnet_pos(word[1]))
                                else:
                                    aspect = lesk(tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 2:
                                if multiple_word_found is True:
                                    aspect = pylesk.simple_lesk(non_tokenized_sentences[i], combined_word_string, find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.simple_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 3:
                                if multiple_word_found is True:
                                    aspect = pylesk.adapted_lesk(non_tokenized_sentences[i], combined_word_string,
                                                             find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.adapted_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 4:
                                if multiple_word_found is True:
                                    aspect = pylesk.cosine_lesk(non_tokenized_sentences[i], combined_word_string,
                                                            find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.cosine_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                        if aspect is not None:
                            if opinion_list == "aspect_tags":
                                aspect_synset_list.append(aspect)
                                aspect_synset_list_definition.append(aspect.definition())
                            else:
                                opinion_synset_list.append(aspect)
                                opinion_synset_list_definition.append(aspect.definition())
            if opinion_list == "aspect_tags":
                full_aspect_synset_list.append(aspect_synset_list)
                full_aspect_synset_list_definition.append(aspect_synset_list_definition)
                aspect_synset_list = []
                aspect_synset_list_definition = []
            else:
                full_opinion_synset_list.append(opinion_synset_list)
                full_opinion_synset_list_definition.append(opinion_synset_list_definition)
                opinion_synset_list = []
                opinion_synset_list_definition = []
    df[algorithm_dict[algorithm_choice] + "_aspect_synset"] = pd.Series(full_aspect_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_aspect_definition"] = pd.Series(full_aspect_synset_list_definition).values
    df[algorithm_dict[algorithm_choice] + "_opinion_synset"] = pd.Series(full_opinion_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_opinion_definition"] = pd.Series(full_opinion_synset_list_definition).values
    end = timer()
    logging.debug("WSD Lesk Time: %.2f seconds" % (end - start))
    return df
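For reference, the dispatch at the heart of wsd_lesk() can be sketched for a single word in isolation; find_wordnet_pos() from the source module is replaced here by a hypothetical Penn-tag mapper, and everything else uses only the nltk/pywsd calls already shown above:

from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from pywsd import lesk as pylesk

def penn_to_wordnet_pos(tag):
    # Hypothetical stand-in for find_wordnet_pos(): map a Penn Treebank tag
    # to a WordNet POS constant, defaulting to noun.
    if tag.startswith("J"):
        return wn.ADJ
    if tag.startswith("V"):
        return wn.VERB
    if tag.startswith("R"):
        return wn.ADV
    return wn.NOUN

def disambiguate(tokens, sentence, word, penn_tag, algorithm_choice):
    pos = penn_to_wordnet_pos(penn_tag)
    if algorithm_choice == 1:
        return lesk(tokens, word, pos)                   # 1: nltk lesk (tokenized context)
    if algorithm_choice == 2:
        return pylesk.simple_lesk(sentence, word, pos)   # 2: pywsd simple lesk
    if algorithm_choice == 3:
        return pylesk.adapted_lesk(sentence, word, pos)  # 3: pywsd adapted lesk
    return pylesk.cosine_lesk(sentence, word, pos)       # 4: pywsd cosine lesk

For example, disambiguate("The bank can guarantee deposits".split(), "The bank can guarantee deposits", "bank", "NN", 3) returns the synset that adapted_lesk picks for that context.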
Code Example #28
                sentences.append(z)

        all_sentences.append(sentences)
        sentences_tagged = []
        sentence = 1

        # Add POS tags to all the sentences and extracting the dependencies
        verbs = set()
        children = []

        # spacy.displacy.serve(sentences[25], style='dep')
        for x in sentences:
            word = 0
            sentence_data = []
            for token in x:
                synset = adapted_lesk(str(x), token.text)
                synset = str(synset)

                if synset != "None":
                    token_synset = synset.split('(', 1)[1].split(')')[0]
                    token_synset = token_synset[1:-1]
                else:
                    token_synset = "None"
                sentence_data.append([
                    token.text, token.pos_, token.dep_, sentence, token.i,
                    token.head.i, token.head, token.lemma_, token_synset
                ])
                word = word + 1

            sentences_tagged.append(
                pd.DataFrame(sentence_data,
Code Example #29
print "Context:", plant_sents[0]
answer = simple_lesk(plant_sents[0],'plant','n', True, \
                     nbest=True, keepscore=True, normalizescore=True)
print "Senses ranked by #overlaps:", answer
best_sense = answer[0][1]
definition = best_sense.definition() 
#except: definition = best_sense.definition
print "Definition:", definition
print

print "======== TESTING adapted_lesk ===========\n"
from pywsd.lesk import adapted_lesk

print "#TESTING adapted_lesk() ..."
print "Context:", bank_sents[0]
answer = adapted_lesk(bank_sents[0],'bank')
print "Sense:", answer
definition = answer.definition()
#except: definition = answer.definition
print "Definition:", definition
print

print "#TESTING adapted_lesk() with pos, stem, nbest and scores."
print "Context:", bank_sents[0]
answer = adapted_lesk(bank_sents[0],'bank','n', True, \
                     nbest=True, keepscore=True)
print "Senses ranked by #overlaps:", answer
best_sense = answer[0][1]
definition = best_sense.definition() 
#except: definition = best_sense.definition
print "Definition:", definition
Code Example #30
File: wsd.py Project: Shoop123/Reezy-NLP-Django
def get_disambiguated_synset(sentence, word, pos):
    translated_pos = get_wordnet_pos(pos)
    synset = adapted_lesk(sentence, word, pos=translated_pos)
    return synset
Code Example #31
                sentences.append(z)

        all_sentences.append(sentences)
        sentences_tagged = []
        sentence = 1

        # Add POS tags to all the sentences and extracting the dependencies
        verbs = set()
        children = []

        # spacy.displacy.serve(sentences[25], style='dep')
        for x in sentences:
            word = 0
            sentence_data = []
            for token in x:
                synset = adapted_lesk(str(x), token.text)
                synset = str(synset)

                if synset != "None":
                    token_synset = synset.split('(', 1)[1].split(')')[0]
                    token_synset = token_synset[1:-1]
                else:
                    token_synset = "None"
                sentence_data.append([
                    token.text, token.pos_, token.dep_, sentence, token.i,
                    token.head.i, token.head, token.lemma_, token_synset
                ])
                word = word + 1

            sentences_tagged.append(
                pd.DataFrame(sentence_data,